I'm using a REST API to retrieve data from an Azure Table using the code below:
library(httr)
library(RCurl)
library(bitops)
library(xml2)
# Stores credentials in variable
Account <- "storageaccount"
Container <- "Usage"
Key <- "key"
# Composes URL
URL <- paste0(
"https://",
Account,
".table.core.windows.net",
"/",
Container
)
# Requests time stamp
requestdate <- format(Sys.time(), "%a, %d %b %Y %H:%M:%S %Z", tz = "GMT")
# As per Microsoft's specs, an empty line is needed for content-length
content_lenght <- 0
# Composes signature string
signature_string <- paste0(
"GET", "\n", # HTTP Verb
"\n", # Content-MD-5
"text/xml", "\n", # Content-Type
requestdate, "\n", # Date
"/", Account, "/", Container # Canonicalized resource
)
# Composes header string
header_string <- add_headers(
Authorization=paste0(
"SharedKey ",
Account,
":",
RCurl::base64(
digest::hmac(
key = RCurl::base64Decode(
Key, mode = "raw"
),
object = enc2utf8(signature_string),
algo = "sha256",
raw = TRUE
)
)
),
'x-ms-date' = requestdate,
'x-ms-version' = "2020-12-06",
'Content-type' = "text/xml"
)
# Creates request
xml_body = content(
GET(
URL,
config = header_string,
verbose()
),
"text"
)
Get_data <- xml_body # Gets data as text from API
From_JSON <-fromJSON(Get_data, flatten = TRUE) # Parses text from JSON
Table_name <- as.data.frame(From_JSON) # Saves data to a table
I can now view the table, but I noted that I can only see the first 1000 rows. What's the most efficient way to implement a loop/cycle that retrieves all the remaining rows and updates the table?
I need to be able to work on the entire dataset.
Also consider that this table will be updated with ~40,000 rows per day, so keeping the visuals current with the data is a concern.
I figured it out... I improved the script and now it's working. :-)
The code can be further refined, but this is a working script that retrieves all the data from the table in iterations.
Connects to an Azure Table based on the specifications for Shared Key: https://learn.microsoft.com/en-us/rest/api/storageservices/authorize-with-shared-key
library(httr)
library(RCurl)
library(bitops)
library(xml2)
library(jsonlite)
library(stringr)
library(dplyr)
library(tidyr)
# Stores credentials in variable
Account <- "storage"
Container <- "Usage"
Key <- "key"
# Composes URL
URL <- paste0(
"https://",
Account,
".table.core.windows.net",
"/",
Container
)
# Requests time stamp
requestdate <- format(Sys.time(), "%a, %d %b %Y %H:%M:%S %Z", tz = "GMT")
# As per Microsoft's specs, an empty line is needed for content-length
content_lenght <- 0
# Composes signature string
signature_string <- paste0(
"GET", "\n", # HTTP Verb
"\n", # Content-MD-5
"text/xml", "\n", # Content-Type
requestdate, "\n", # Date
"/", Account, "/", Container # Canonicalized resource
)
# Composes header string
header_string <- add_headers(
Authorization=paste0(
"SharedKey ",
Account,
":",
RCurl::base64(
digest::hmac(
key = RCurl::base64Decode(
Key, mode = "raw"
),
object = enc2utf8(signature_string),
algo = "sha256",
raw = TRUE
)
)
),
'x-ms-date' = requestdate,
'x-ms-version' = "2020-12-06",
'Content-type' = "text/xml"
)
# Calls
Get_headers <- capture.output( # Retrieves metadata
content(
GET(
URL,
config = header_string,
verbose()
)
),
type = "message"
)
Server_response <- Get_headers[11] %>% # Retrieves server response
trimws( whitespace = "\r") %>%
trimws( whitespace = "<- ") %>%
grepl("HTTP/1.1 200 OK")
Get_headers
Server_response
# Initializes counters
Pages <- 0
Next_headers_count <- 0
while(isTRUE(Server_response)) {
Pages <- Pages + 1
xml_body <- content( # Retrieves up to 1000 rows from the table
GET(
URL,
config = header_string,
verbose()
),
"text"
)
Get_data <- xml_body # Gets data as text from API
From_JSON <-fromJSON(Get_data, flatten = TRUE) # Parses text from JSON
Temp_table_name <- as.data.frame(From_JSON) # Saves current rows to temp table
Table_name <- bind_rows(Temp_table_name, Table_name) # Appends new data to the initial data
# Checks if there are more than 1000 rows to be fetched
x_ms_continuation_NextPartitionKey <- Get_headers[19] %>%
trimws( whitespace = "<- ") %>%
gsub("\\.*", "x-ms-continuation-NextPartitionKey") %>%
grepl("x-ms-continuation-NextPartitionKey", fixed = TRUE)
x_ms_continuation_NextRowKey <- Get_headers[20] %>%
trimws( whitespace = "<- ") %>%
gsub("\\.*", "x-ms-continuation-NextRowKey") %>%
grepl("x-ms-continuation-NextRowKey", fixed = TRUE)
x_ms_continuation_NextPartitionKey
x_ms_continuation_NextRowKey
if (isTRUE(x_ms_continuation_NextPartitionKey) &
isTRUE(x_ms_continuation_NextRowKey)) {
Next_headers_count <- Next_headers_count + 1
Next_Partition_Key <- Get_headers[19] %>% # Extracts the value of the Next Partition Key
str_remove(".+(?= )") %>%
trimws( whitespace =" ") %>%
trimws( whitespace = "\r")
Next_Row_key <- Get_headers[20] %>% # Extracts the value of the Next Row Key
str_remove(".+(?= )") %>%
trimws( whitespace =" ") %>%
trimws( whitespace = "\r")
URL <- paste0( # Creates the URL for the Next Authentication token
"https://",
Account,
".table.core.windows.net",
"/",
Container,
"?",
"NextPartitionKey=",
Next_Partition_Key,
"&NextRowKey=",
Next_Row_key
)
Get_headers <- capture.output( # Retrieves new metadata
content(
GET(
URL,
config = header_string,
verbose()
)
),
type = "message"
)
Server_response <- Get_headers[11] %>% # Retrieves new server response
trimws( whitespace = "\r") %>%
trimws( whitespace = "<- ") %>%
grepl("HTTP/1.1 200 OK")
}
}
Pages
Next_headers_count
View(Table_name)