I'm trying to read in a lot of data as efficiently as possible; the data are in about 1,400 CSV files spread across 6 ZIP files. The CSV files all contain similar time-series data with the same columns, except that some CSV files have an extra column ("yield_c" in this example). I need to import all of the data and would prefer not to unzip the ZIP files. I've had some success with two approaches, but both have problems; call them A and B. Approach A runs much faster than B, but it adds each CSV header as a new data row and (likely as a consequence) reads every column as character. Approach B processes the full dataset correctly but is abominably slow.
Any tips on coding this for better efficiency and a proper data pull? I think I am missing something simple about how fread processes the cmd input, but I can't figure out how to make approach A work correctly without it adding the header rows every time.
Here is a sample dataset of "only" 240 CSV files among 6 ZIP files, along with my two data processor functions datprocessorA() and datprocessorB(). Currently (for me) approach A takes a few seconds and approach B takes about 2.5 minutes.
Apologies for any formatting or etiquette issues; I don't post here often, since I can usually find a solution. Similarly, apologies if this is a dupe, but I've scoured the forums and haven't found anything that helps this specific case; plenty of related issues, but not this one in particular.
library(data.table)
library(magrittr) # datprocessorB() below uses the %>% pipe
# Create sample CSV files and zip them in a folder
tempath <- "temp"
dir.create(file.path(tempath), showWarnings = FALSE)
setwd(tempath)
dfs <- list()
num_dfs <- 240
for (i in 1:num_dfs) {
  data <- data.frame(date = seq(as.Date("1980-10-01"), as.Date("2020-09-01"), by = "month"),
                     Subset = paste0("Subset", i),
                     yield_a = sample(0:20, 480, replace = TRUE),
                     yield_b = sample(0:5, 480, replace = TRUE))
  df_name <- paste0("df", i)
  dfs[[df_name]] <- data
}
# randomly (ish) add "yield_c" column to a subset of dataframes
seqz <- sort(unique(c(1,sample(seq(1,num_dfs, by = 5), size = num_dfs%/%3, replace = TRUE))))
for (j in seqz) {
  dfs[[j]]$yield_c <- sample(1:10, 480, replace = TRUE)
}
# write all to csv, zip up, delete csvs
for (k in seq_along(dfs)) {
  file_name <- paste0("file", k, ".csv") # create a unique filename
  write.csv(dfs[[k]], file = file_name, row.names = FALSE) # save to CSV
}
zipcount <- 6
for (l in 1:zipcount) {
  zip(paste0("zip", l, ".zip"),
      paste0("file", seq(num_dfs/zipcount*(l-1) + 1, num_dfs/zipcount*l, 1), ".csv"))
}
file.remove(list.files(pattern = "\\.csv$", full.names = TRUE))
# import
# return to parent directory to more closely imitate my scenario
setwd("..")
# get paths for all zip files
zips <- list.files(path = tempath, full.names = TRUE)
# first function to import all csvs into R as a table
datprocessorA <- function(zipdir) {
  # one shell command per zip; "unzip -p" streams the contents of every CSV in the archive
  connz <- paste("unzip -p", zipdir)
  datz <- rbindlist(lapply(connz, function(x) fread(cmd = x, sep = ",", header = TRUE,
                                                    stringsAsFactors = FALSE, fill = Inf)),
                    use.names = TRUE, fill = TRUE)
  datz
}
# alternative function to import all csvs into R as a table
datprocessorB <- function(zipdir) {
  # list the archive members, then stream each CSV through unzip individually
  datz <- lapply(setNames(nm = unzip(zipdir, list = TRUE)$Name),
                 function(fn) fread(cmd = paste("unzip -p -a", zipdir, fn), header = TRUE,
                                    stringsAsFactors = FALSE, fill = Inf)) %>%
    rbindlist(use.names = TRUE, fill = TRUE)
  datz
}
# for loop to process import function; choose datprocessorA or datprocessorB
funx <- datprocessorA
begin <- Sys.time()
for (i in seq_along(zips)) {
  zipdir <- zips[i]
  if (i == 1) {
    import <- funx(zipdir)
  } else {
    dat2 <- funx(zipdir)
    import <- rbind(import, dat2, fill = TRUE)
    rm(dat2)
  }
}
end <- Sys.time()
print(paste0("Approach A length: ",round(end-begin), " seconds"))
funx <- datprocessorB
begin <- Sys.time()
for (i in seq_along(zips)) {
  zipdir <- zips[i]
  if (i == 1) {
    import <- funx(zipdir)
  } else {
    dat2 <- funx(zipdir)
    import <- rbind(import, dat2, fill = TRUE)
    rm(dat2)
  }
}
end <- Sys.time()
print(paste0("Approach B length: ",round(end-begin), " minutes"))
The following function seems to do what you want and is fast. It reads all the files in the zips passed in a vector zip_vector, with no need for a for loop. (Incidentally, approach A mis-reads the data because unzip -p with no member name streams every CSV in the archive as one concatenated stream, so fread sees each subsequent file's header line as an ordinary data row.)
datprocessorC <- function(zip_vector) {
  f <- function(zipfile) {
    # list the members of one zip, extract each CSV, and read it with fread
    fls <- unzip(zipfile, list = TRUE)$Name
    lapply(fls, \(f) fread(unzip(zipfile, files = f))) |> rbindlist(fill = TRUE)
  }
  lapply(zip_vector, f) |> rbindlist(fill = TRUE)
}
# run on the zip files vector returned by list.files()
importC <- datprocessorC(zips)
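One caveat: unzip(zipfile, files = f) with the default exdir extracts each CSV into the working directory, so this approach leaves the extracted files behind. If that matters, here is a minimal sketch of the same idea (datprocessorC2 is just an illustrative name) that extracts into a throwaway scratch directory and deletes it on exit:
datprocessorC2 <- function(zip_vector) {
  f <- function(zipfile) {
    fls <- unzip(zipfile, list = TRUE)$Name
    scratch <- tempfile("unzipped") # per-zip scratch directory, removed below
    on.exit(unlink(scratch, recursive = TRUE), add = TRUE)
    # unzip() returns the path it extracted to, which fread reads directly
    lapply(fls, \(f) fread(unzip(zipfile, files = f, exdir = scratch))) |>
      rbindlist(fill = TRUE)
  }
  lapply(zip_vector, f) |> rbindlist(fill = TRUE)
}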
Here is another function that reads the files with unz/readr::read_csv without extracting anything to disk: no file1.csv, file2.csv, etc. are created. This function takes about twice as long as datprocessorC to extract the data.
datprocessorD <- function(zip_vector) {
  f <- function(zipfile) {
    fls <- unzip(zipfile, list = TRUE)$Name
    out <- vector("list", length(fls))
    for (i in seq_along(fls)) {
      # unz() opens a connection to a single member of the zip without extracting it
      tmp <- unz(zipfile, filename = fls[i])
      out[[i]] <- readr::read_csv(file = tmp, show_col_types = FALSE)
    }
    data.table::rbindlist(out, fill = TRUE)
  }
  lapply(zip_vector, f) |> data.table::rbindlist(fill = TRUE)
}
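It is called the same way as datprocessorC; for example, to read everything and compare the two on the sample data (importC comes from the snippet above):
importD <- datprocessorD(zips)
# optional: time both approaches side by side on the sample zips
system.time(importC <- datprocessorC(zips))
system.time(importD <- datprocessorD(zips))
identical(dim(importC), dim(importD)) # sanity check: both read the same data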