razureloopsazure-table-storage

How to resolve paging when retrieving data from REST API in R


I'm using a REST API to retrieve data from an Azure Table using the code below:

library(httr)
library(RCurl)
library(bitops)
library(xml2)

# Stores credentials in variable

Account <- "storageaccount"
Container <- "Usage"
Key <- "key"


# Composes URL

URL <- paste0(
          "https://", 
          Account, 
          ".table.core.windows.net", 
          "/", 
          Container
        )

# Requests time stamp

requestdate <- format(Sys.time(), "%a, %d %b %Y %H:%M:%S %Z", tz = "GMT")


# As per Microsoft's specs, an empty line is needed for content-length

content_lenght <- 0

# Composes signature string

signature_string <- paste0(
                     "GET", "\n",                 # HTTP Verb
                     "\n",                        # Content-MD-5
                     "text/xml", "\n",            # Content-Type
                     requestdate, "\n",           # Date
                     "/", Account, "/", Container # Canonicalized resource
                    )

# Composes header string

header_string <- add_headers(
                    Authorization=paste0(
                      "SharedKey ", 
                      Account, 
                      ":",
                      RCurl::base64(
                        digest::hmac(
                          key = RCurl::base64Decode(
                            Key, mode = "raw"
                          ),
                          object = enc2utf8(signature_string),
                          algo = "sha256", 
                          raw = TRUE
                            )
                        )
                    ),
                    'x-ms-date' = requestdate, 
                    'x-ms-version' = "2020-12-06",
                    'Content-type' = "text/xml"
                  )

# Creates request

xml_body = content(
                GET(
                  URL, 
                  config = header_string, 
                  verbose()
                  ),
                "text"
                )

Get_data <- xml_body                             # Gets data as text from API  
From_JSON <-fromJSON(Get_data, flatten = TRUE)   # Parses text from JSON
Table_name <- as.data.frame(From_JSON)           # Saves data to a table

I can now view the table, but I noted that I can only see the first 1000 rows. What's the most efficient way to implement a loop/cycle that retrieves all the remaining rows and updates the table?

I need to be able to work on the entire dataset.

Also consider that this table will be updated with ~40,000 rows per day, so keeping the visuals current with the data is a concern.


Solution

  • I figured it out... I improved the script and now it's working. :-)

    The code can be further refined, but this is a working script that retrieves all the data from the table in iterations.

    Connects to an Azure Table based on the specifications for Shared Key: https://learn.microsoft.com/en-us/rest/api/storageservices/authorize-with-shared-key
    
    
    library(httr)
    library(RCurl)
    library(bitops)
    library(xml2)
    library(jsonlite)
    library(stringr)
    library(dplyr)
    library(tidyr)
    
    # Stores credentials in variable
    
    Account <- "storage"
    Container <- "Usage"
    Key <- "key"
    
    
    # Composes URL
    
    URL <- paste0(
              "https://", 
              Account, 
              ".table.core.windows.net", 
              "/", 
              Container
            )
    
    # Requests time stamp
    
    requestdate <- format(Sys.time(), "%a, %d %b %Y %H:%M:%S %Z", tz = "GMT")
    
    
    # As per Microsoft's specs, an empty line is needed for content-length
    
    content_lenght <- 0
    
    # Composes signature string
    
    signature_string <- paste0(
                         "GET", "\n",                 # HTTP Verb
                         "\n",                        # Content-MD-5
                         "text/xml", "\n",            # Content-Type
                         requestdate, "\n",           # Date
                         "/", Account, "/", Container # Canonicalized resource
                        )
    
    # Composes header string
    
    header_string <- add_headers(
                        Authorization=paste0(
                          "SharedKey ", 
                          Account, 
                          ":",
                          RCurl::base64(
                            digest::hmac(
                              key = RCurl::base64Decode(
                                Key, mode = "raw"
                              ),
                              object = enc2utf8(signature_string),
                              algo = "sha256", 
                              raw = TRUE
                                )
                            )
                        ),
                        'x-ms-date' = requestdate, 
                        'x-ms-version' = "2020-12-06",
                        'Content-type' = "text/xml"
                      )
    
    
    # Calls
    
    
    Get_headers <- capture.output(                                          # Retrieves metadata
                      content(
                        GET(
                          URL, 
                          config = header_string,
                          verbose()
                        )
                      ), 
                     type = "message"
                   )       
    
    Server_response <- Get_headers[11] %>%                                    # Retrieves server response
                             trimws( whitespace = "\r") %>% 
                                trimws( whitespace = "<- ") %>%
                                   grepl("HTTP/1.1 200 OK")
    
    Get_headers
    Server_response
    
    # Initializes counters
    
    Pages <- 0
    Next_headers_count <- 0
    
      while(isTRUE(Server_response)) {
        
            Pages <- Pages + 1
        
            xml_body <- content(                                       # Retrieves up to 1000 rows from the table
                            GET(
                              URL, 
                              config = header_string, 
                               verbose()
                             ), 
                            "text"
                          ) 
            
            Get_data <- xml_body                                      # Gets data as text from API  
            From_JSON <-fromJSON(Get_data, flatten = TRUE)            # Parses text from JSON
            Temp_table_name <- as.data.frame(From_JSON)               # Saves current rows to temp table
            Table_name <- bind_rows(Temp_table_name, Table_name)      # Appends new data to the initial data
        
            # Checks if there are more than 1000 rows to be fetched
        
            x_ms_continuation_NextPartitionKey <- Get_headers[19] %>%                                              
                                                    trimws( whitespace = "<- ") %>%
                                                      gsub("\\.*", "x-ms-continuation-NextPartitionKey") %>%
                                                        grepl("x-ms-continuation-NextPartitionKey", fixed = TRUE)
       
            x_ms_continuation_NextRowKey <- Get_headers[20] %>%
                                              trimws( whitespace = "<- ") %>% 
                                                 gsub("\\.*", "x-ms-continuation-NextRowKey") %>%
                                                   grepl("x-ms-continuation-NextRowKey", fixed = TRUE) 
            
            x_ms_continuation_NextPartitionKey
            x_ms_continuation_NextRowKey
            
              if (isTRUE(x_ms_continuation_NextPartitionKey) & 
                     isTRUE(x_ms_continuation_NextRowKey)) {
              
                     Next_headers_count <- Next_headers_count + 1
              
                     Next_Partition_Key <- Get_headers[19] %>%                    # Extracts the value of the Next Partition Key
                                              str_remove(".+(?= )") %>%       
                                                trimws( whitespace =" ") %>%
                                                  trimws( whitespace = "\r")
                 
                     Next_Row_key <- Get_headers[20] %>%                          # Extracts the value of the Next Row Key
                                         str_remove(".+(?= )") %>% 
                                            trimws( whitespace =" ") %>%
                                               trimws( whitespace = "\r")
              
                     URL <- paste0(                                               # Creates the URL for the Next Authentication token
                             "https://", 
                              Account, 
                             ".table.core.windows.net", 
                             "/", 
                             Container, 
                             "?", 
                            "NextPartitionKey=", 
                             Next_Partition_Key, 
                             "&NextRowKey=",
                             Next_Row_key
                            )
               
                   
                      Get_headers <- capture.output(                                          # Retrieves new metadata
                                           content(
                                                 GET(
                                                   URL, 
                                                   config = header_string,
                                                   verbose()
                                                 )
                                              ), 
                                            type = "message"
                                       )       
    
                       Server_response <- Get_headers[11] %>%                                    # Retrieves new server response
                                              trimws( whitespace = "\r") %>% 
                                                 trimws( whitespace = "<- ") %>%
                                                     grepl("HTTP/1.1 200 OK")
              }
            
      }
    
    Pages
    Next_headers_count
    View(Table_name)