rshellcsvzipfread

How can I read multiple csv files from within multiple zip files with sometimes different columns using R fread?


I'm trying to read in a lot of data as efficiently as possible; the data are in about 1,400 CSV files within 6 individual ZIP files. The CSV files are all similar timeseries data with the same columns, except some CSV files have an extra column ("yield_c" in this example). I need to import all data and would prefer to not unzip the ZIP files. I've had success with a few approaches but both have their problems; we'll call them A and B. Approach A seems to run much faster than B, but it is adding each CSV header as a new data row and (likely related) is causing all columns to be read as character. Approach B processes the full dataset correctly but is abominably slow.

Any tips on coding this better for efficiency and a proper data pull? I think I am missing something simple about how freadr processes the cmd input but I can't figure out how to make approach A work correctly without adding the header rows every time.

Here is a sample dataset of "only" 240 CSV files among 6 ZIP files with my two data processor functions datprocessorA() and datprocessorB(). Currently (for me) approach A takes a few seconds and approach B takes 2.5 minutes.

Apologies for any formatting or etiquette issues, I don't post here as I can usually find a solution. Similarly, apologies if this is a dupe but I've scoured the forums and haven't found anything to help this specific case; plenty of related issues but not this one in particular.

library(data.table)

# Create sample CSV files and zip them in a folder

tempath <- "temp"
dir.create(file.path(tempath), showWarnings = FALSE)

setwd(tempath)

dfs <- list()
num_dfs <- 240

for (i in 1:num_dfs) {
  
  data <- data.frame(date = seq(as.Date("1980-10-01"), as.Date("2020-09-01"), by = "month"),
                     Subset = paste0("Subset", i),
                     yield_a = sample(0:20,480, replace = TRUE),
                     yield_b = sample(0:5,480, replace = TRUE))
  
  df_name <- paste0("df", i)
  
  dfs[[df_name]] <- data
}

# randomly (ish) add "yield_c" column to a subset of dataframes
seqz <- sort(unique(c(1,sample(seq(1,num_dfs, by = 5), size = num_dfs%/%3, replace = TRUE))))

for (j in seqz) {
  dfs[[j]]$yield_c = sample(1:10,480, replace = TRUE)
}

# write all to csv, zip up, delete csvs
for (k in seq_along(dfs)) {
  file_name <- paste0("file", k, ".csv") # Create a unique filename
  write.csv(dfs[[k]], file = file_name, row.names = FALSE) # Save to CSV
}

zipcount <- 6
for (l in 1:zipcount) {
  zip(paste0("zip",l,".zip"), paste0("file", seq(num_dfs/zipcount*(l-1)+1,num_dfs/zipcount*l,1), ".csv"))
}

file.remove(list.files(pattern = "\\.csv$", full.names = TRUE))
# import

# return to parent directory to more closely imitate my scenario
setwd("..")

#get path for all zip files
zips <- paste0(tempath, "/", list.files(path = tempath, full.names = F, recursive = F))

# first function to import all csvs into R as a table

datprocessorA <- function(zipdir) {
  
  connz <- paste("unzip -p",zipdir)
  datz <- rbindlist(lapply(connz, function(x) fread(x, sep=',', header = TRUE,
                                                   stringsAsFactors=FALSE, fill = Inf)),
                    use.names = TRUE, fill=TRUE)

  datz
}

# alternative function to import all csvs into R as a table
datprocessorB <- function(zipdir) {
  
  datz <- lapply(setNames(nm = unzip(zipdir, list = TRUE)$Name),
                 function(fn) fread(cmd = paste("unzip -p -a",zipdir, fn), header = TRUE,
                                    stringsAsFactors=FALSE, fill = Inf)) %>%
    rbindlist(use.names = TRUE, fill=TRUE)
  datz
}

# for loop to process import function; choose datprocessorA or datprocessorB

funx <- datprocessorA

begin <- Sys.time()
for (i in 1:length(zips)) {
  
  zipdir <- zips[i]
  
  if (i == 1) {
    import  <- funx(zipdir)
  } else {
    dat2 <- funx(zipdir)
    import  <- rbind(import, dat2, fill = TRUE)
    rm(dat2)
  }
}
end <- Sys.time()

print(paste0("Approach A length: ",round(end-begin), " seconds"))

funx <- datprocessorB

begin <- Sys.time()
for (i in 1:length(zips)) {
  
  zipdir <- zips[i]
  
  if (i == 1) {
    import  <- funx(zipdir)
  } else {
    dat2 <- funx(zipdir)
    import  <- rbind(import, dat2, fill = TRUE)
    rm(dat2)
  }
}
end <- Sys.time()

print(paste0("Approach B length: ",round(end-begin), " minutes"))


Solution

  • The following function seems to do what you want and is fast.
    It reads all files in the zips passed in a vector zip_vector, with no need for a for loop.

    datprocessorC <- function(zip_vector) {
      f <- function(zipfile) {
        fls <- unzip(zipfile, list = TRUE)$Name
        lapply(fls, \(f) fread(unzip(zipfile, files = f))) |> rbindlist(fill = TRUE)
      }
      lapply(zip_vector, f) |> rbindlist(fill = TRUE)
    }
    
    # run on the zip files vector returned by list.files()
    importC <- datprocessorC(zips)
    

    Edit

    Here is another function that reads the files with unz/readr::read_csv without unzipping. No file1.csv, file2.csv, etc are created.

    This function takes about twice the time of datprocessorC to extract the data.

    datprocessorD <- function(zip_vector) {
      f <- function(zipfile) {
        fls <- unzip(zipfile, list = TRUE)$Name
        out <- vector("list", length(fls))
        for(i in seq_along(fls)) {
          tmp <- unz(zipfile, filename = fls[i])
          out[[i]] <- readr::read_csv(file = tmp, show_col_types = FALSE)
        }
        data.table::rbindlist(out, fill = TRUE)
      }
      lapply(zip_vector, f) |> data.table::rbindlist(fill = TRUE)
    }