I am reading in data from individual xlsx files, with the data stored in 10-20 thousand individual tabs in each workbook file. The first sheet contains a master data table, including links to the individual tabs with further data.
The column-based 'tabbed' data is summarized and transposed before being appended to the master data.
The master data table is large in its own right (tens of thousands of rows by hundreds of columns), while the individual data tabs are small (a few columns by tens to a few hundred rows).
Using the XLConnect package, I crashed out of memory already on calling loadWorkbook() (R 3.4.0, RStudio 1.1.383, 64-bit, 8 GB machine); otherwise I could have worked along the lines of this.
Because I need to load from individual tabs, I am currently using a nested for() loop to load each individual tab's data. However, with my number of tabs this takes nearly a minute per loop, putting the total execution time at nearly a week! Using a nested for() loop is also decidedly non-tidy, so I suspect there is a neater and (much) faster way to achieve this, but I can't see it.
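For reference, a tidier shape for the per-sheet read would be to map over the sheet names with purrr instead of a for() loop. This is only a sketch using the names from the example below, and summarise_sheet is a hypothetical stand-in for the per-sheet summarising; it also does not address the speed problem, since read_xlsx is still called once per sheet:

library(purrr)
library(readxl)
# Sketch only: summarise_sheet() is a hypothetical wrapper for the
# per-sheet processing shown in the loop further down; it is not defined here.
summaries <- map_dfr(linkReferences$sheet,
                     ~ summarise_sheet(read_xlsx(my.sheets.file, sheet = .x,
                                                 skip = 2, col_names = FALSE,
                                                 col_types = c("text", "text", "text"))))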
I have read the links into a dedicated data frame (linkReferences) in R.
The data source is not mine, so I am stuck with the input as provided.
The problem is purely related to the speed of reading the sheets, which appears to grow as the number of sheets in a file (and thus the file size) grows.
I am looking for any solution to speed this up. Updated with a self-contained minimal example below.
On my PC, n = 10 gives ~0.16 sec/sheet, n = 100 gives ~0.56 sec/sheet, and n = 1000 gives ~3 sec/sheet, which is similar to what I'm seeing in my real data (<10 sec/sheet for 16k sheets).
library(tidyverse)

number_of_sheets <- 100

# =========================================================================
# CREATE SAMPLE FILE. Layout similar to actual data.
library(openxlsx)
my.sheets.file <- "sampleXLSX.xlsx"

linkReferences <- data_frame(sheet = str_c("Data ", seq_len(number_of_sheets)))
wb <- write.xlsx(linkReferences, file = my.sheets.file)

sample_header <- data.frame(head_name = c("head1", "head2", "head3", "head4", "head5"),
                            head_text = c("text1", "text2", "text3", "text4", "text5"))

set.seed(31415)
for (i in 1:number_of_sheets) {
  cat(i, "..")
  sheet_name_i <- paste0("Data ", i)
  addWorksheet(wb, sheetName = sheet_name_i)
  writeData(wb, sheet = sheet_name_i, sample_header, startCol = "B", startRow = 2)
  # each sheet gets a random number of data rows
  n <- ceiling(runif(1) * 200)
  sample_data <- data_frame(A = seq_len(n),
                            B = runif(n),
                            C = sample(1:5, n, replace = TRUE))
  writeData(wb, sheet = sheet_name_i, sample_data, startCol = "B", startRow = 10)
}
saveWorkbook(wb, file = my.sheets.file, overwrite = TRUE)
#===========================================================================
# THIS IS THE ACTUAL QUESTION
# Read from file with many tabs
library(readxl)
library(stringr)
linkReferences <- linkReferences %>%
  mutate(Head1 = NA, Head2 = NA, Head3 = NA, Head4 = NA, Head5 = NA,
         A.1 = NA, B.1 = NA, C.1 = NA,
         A.2 = NA, B.2 = NA, C.2 = NA,
         A.3 = NA, B.3 = NA, C.3 = NA,
         A.4 = NA, B.4 = NA, C.4 = NA,
         A.5 = NA, B.5 = NA, C.5 = NA)
linkReferences.nrows <- nrow(linkReferences)
lRnames <- names(linkReferences)

start.row <- 1
start_time <- Sys.time()
for (i in start.row:linkReferences.nrows) {
  cat("i=", i, " / ", linkReferences.nrows, "\n")
  start_time_i <- Sys.time()
  linked_data <- read_xlsx(my.sheets.file,
                           sheet = as.character(linkReferences[i, "sheet"]),
                           skip = 2,
                           col_types = c("text", "text", "text"),
                           col_names = FALSE)
  print(Sys.time() - start_time_i)  # This takes 99% of the loop time
  linkReferences[i, 2:6] <- unlist(linked_data[1:5, 2])
  data_head_row <- which(linked_data[, 1] == "A")
  names(linked_data) <- c("A", "B", "C")
  linked_data <- linked_data[(data_head_row + 1):nrow(linked_data), ]
  # create a (rather random) sample summary
  summary_linked_data <- linked_data %>%
    group_by(C) %>%
    summarise(B = last(B), A = last(A)) %>%
    arrange(desc(C))
  # not all data has the full range of options, so use actual number
  summary_linked_data_nrows <- nrow(summary_linked_data)
  #start_time_i2 <- Sys.time()
  for (ii in 1:summary_linked_data_nrows) {
    linkReferences[i, match(str_c("A.", ii), lRnames):match(str_c("C.", ii), lRnames)] <-
      summary_linked_data[ii, ]
  }
  #print(Sys.time()-start_time_i2)
  print(linkReferences[i, 2:20])
  # ________________________________________________________
  # BELOW IS ONLY FOR TEST LOOP TIMING STATS IN THIS EXAMPLE
  delta_time <- Sys.time() - start_time
  delta_time_attr <- attr(delta_time, "units")
  row_time <- delta_time / (i - start.row + 1)
  if (delta_time_attr == "mins") {
    row_time <- row_time * 60
  } else if (delta_time_attr == "hours") {
    row_time <- row_time * 3600
  }
  # estimated total run time in hours, over all rows
  total_time <- row_time * (linkReferences.nrows - start.row + 1) / 3600
  cat("Passed time: ", delta_time, attr(delta_time, "units"),
      " | time/row: ", round(row_time, 2), "secs.",
      " | Est total time:",
      round(total_time * 60, 2), "mins ( =",
      round(total_time, 2), "hours )",
      "\n---------------\n")
}
# Conversion of data loaded as character to numeric can all happen outside the loop, once all data is loaded.
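For example, once the whole table is filled, a single pass over the summary columns does the conversion. A sketch, assuming the A.*/B.*/C.* column naming used above:

# Sketch: convert the character summary columns to numeric in one pass
# (the regex assumes the A.1 .. C.5 naming from the mutate() above).
linkReferences <- linkReferences %>%
  mutate_at(vars(matches("^[ABC]\\.[1-5]$")), as.numeric)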
After some digging: XLConnect, with its vectorised sheet-reading capability (see here), is the clear winner, provided you can fit your workbook in memory.
I had to (a) reduce the size of my workbook, and (b) set the XLConnect memory to 4 GB as per @Joshua's link here.
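The pattern, in brief (a sketch of what the full code below does; note the heap option must be set before library(XLConnect) first starts the JVM, or it has no effect):

options(java.parameters = "-Xmx4g")  # must come before library(XLConnect)
library(XLConnect)
wb <- loadWorkbook(my.sheets.file)                   # pulls the whole file into memory
sheet_names <- getSheets(wb)                         # or any character vector of sheet names
data_lst <- readWorksheet(wb, sheet = sheet_names)   # one vectorised call -> named list of data frames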
For the 1000-sheet example as per the question above:

wb <- loadWorkbook() took 15 seconds,
linked_data_lst <- readWorksheet() took 34 seconds,
and the data extraction for (i in 1:nr_linked_data) {...} from the now in-memory list took 86 seconds.

This gives a total time of 0.135 sec/sheet (22x faster than the code above).
#============================================================================
# now read it again
library(stringr)
options(java.parameters = "-Xmx4g")
library(XLConnect)

linkReferences <- linkReferences %>%
  mutate(Head1 = NA, Head2 = NA, Head3 = NA, Head4 = NA, Head5 = NA,
         A.1 = NA, B.1 = NA, C.1 = NA,
         A.2 = NA, B.2 = NA, C.2 = NA,
         A.3 = NA, B.3 = NA, C.3 = NA,
         A.4 = NA, B.4 = NA, C.4 = NA,
         A.5 = NA, B.5 = NA, C.5 = NA)
linkReferences.nrows <- nrow(linkReferences)
lRnames <- names(linkReferences)
lRcols <- match("A.1", lRnames):match("C.5", lRnames)
lRheadCols <- (lRcols[1] - 5):(lRcols[1] - 1)
start_time <- Sys.time()
wb <- loadWorkbook(my.sheets.file)
Sys.time() - start_time

start.row <- 1
end.row <- linkReferences.nrows

start_time0 <- Sys.time()
# one vectorised call reads every requested sheet into a named list
linked_data_lst <- readWorksheet(wb,
                                 sheet = linkReferences[start.row:end.row, ][["sheet"]],
                                 startCol = 2,
                                 endCol = 4,
                                 startRow = 3,
                                 header = FALSE)
delta_time <- (Sys.time() - start_time0) %>% print()
nr_linked_data <- length(linked_data_lst)
start_time <- Sys.time()
for (i in 1:nr_linked_data) {
  cat("i=", i, " / ", nr_linked_data, "\n")
  linked_data <- as_tibble(linked_data_lst[[i]])
  # EVERYTHING BELOW HERE IS EXACTLY THE SAME AS IN THE QUESTION CODE
  # =================================================================
  linkReferences[i, lRheadCols] <- unlist(linked_data[1:5, 2])
  data_head_row <- which(linked_data[, 1] == "A")
  names(linked_data) <- c("A", "B", "C")
  linked_data <- linked_data[(data_head_row + 1):nrow(linked_data), ]
  linked_data <- linked_data %>% mutate_all(funs(as.numeric))
  # create a (rather random) sample summary
  summary_linked_data <- linked_data %>%
    group_by(C) %>%
    summarise(B = last(B), A = last(A)) %>%
    arrange(desc(C))
  # not all data has the full range of options, so use actual number
  summary_linked_data_nrows <- nrow(summary_linked_data)
  #start_time_i2 <- Sys.time()
  for (ii in 1:summary_linked_data_nrows) {
    linkReferences[i, match(str_c("A.", ii), lRnames):match(str_c("C.", ii), lRnames)] <-
      summary_linked_data[ii, ]
  }
  #print(Sys.time()-start_time_i2)
  print(linkReferences[i, lRheadCols[1]:max(lRcols)])
  delta_time <- Sys.time() - start_time
  delta_time_attr <- attr(delta_time, "units")
  row_time <- delta_time / (i - start.row + 1)
  if (delta_time_attr == "mins") {
    row_time <- row_time * 60
  } else if (delta_time_attr == "hours") {
    row_time <- row_time * 3600
  }
  # estimated total run time in hours, over all rows
  total_time <- row_time * (linkReferences.nrows - start.row + 1) / 3600
  cat("Passed time: ", delta_time, attr(delta_time, "units"),
      " | time/row: ", round(row_time, 2), "secs.",
      " | Est total time:",
      round(total_time * 60, 2), "mins ( =",
      round(total_time, 2), "hours )",
      "\n---------------\n")
}