Tags: r, foreach, doParallel, mclapply

R foreach: Read and manipulate multiple files in parallel


I have 500 tar.xz files containing 2000 csv files. I need to untar them a few tar files at a time (because of disk space), process them into a data.table, delete the csv files from disk and then save the result as RDS before moving on to the next few tar files.

My function works fine in serial but in parallel it gets the files mixed up between cores. Why is this?

Some sample data:

    # Build 5 sample tar.xz archives; each inner pass (over)writes the same
    # five CSVs of 50 randomly sampled rows, so every archive holds 5 files.
    # NOTE(review): assumes a data frame `df` already exists in the
    # workspace — it is not defined in this snippet.
    for(j in 1:5){
     for(i in 1:5){
      a<-df[sample(x = 1:nrow(df), size = 50, replace = TRUE),]
      write.csv(a,paste0("seed_",i,".csv"))
      # `lf` is recomputed on every inner pass; the pattern ".csv" is an
      # unanchored regex (the dot matches any character), so "\\.csv$"
      # would be the stricter, intended match.
      lf<-list.files(pattern=".csv")
                  }
     # Archive whatever matched above into seed_<j>.tar (xz-compressed).
     tar(tarfile = paste0("seed_",j,".tar"),files = lf,compression = c("xz"), tar="tar")
                 }

Example code with foreach

# OP's parallel attempt. NOTE(review): the "files mixed up between cores"
# symptom comes from every worker sharing the single directory
# "tempOutputFiles" — workers untar, read, and delete concurrently in the
# same folder, so each worker sees the others' CSVs.
require(dplyr)
require(tidyr)
require(foreach)
require(doParallel)
require(magrittr)

#List all tar files in directory
# NOTE(review): ".tar" is an unanchored regex; "\\.tar$" would be safer.
list_of_files<-list.files(pattern = ".tar")

  packsINeed<-c("vroom","magrittr","dplyr","tidyr","doParallel")
    
  #Start for loop

myCluster<-makeCluster(6,type="PSOCK")
registerDoParallel(myCluster) 

  foreach(i= 1:NROW(list_of_files),.packages = packsINeed)%dopar%{

print(paste(list_of_files[i], "which is", i, "of", NROW(list_of_files) ))

print("2. Untar .csv files inside")
 # BUG: all 6 workers extract into the SAME directory — this is the race
 # that mixes files between cores.
 untar(tarfile = list_of_files[i], exdir = "tempOutputFiles")



 print("#3. Read in files and add up two columns")
# BUG: "tempOutputFiles/$.csv" is passed as the `path` argument of
# list.files(), not as a pattern — no such directory exists, so this
# returns character(0). Intended: list.files("tempOutputFiles",
# pattern = "\\.csv$", full.names = TRUE).
df<-vroom::vroom(list.files("tempOutputFiles/$.csv"), id="path")

df$A<-df$B+df$C

    print("#4. save RDS")

saveRDS(object = df, file = paste0(tools::file_path_sans_ext(list_of_files[i], compression = TRUE),".rds"))

 print("#5. Clean up files")

# These names are already bare (no directory prefix) because list.files()
# was called without full.names = TRUE ...
.files<-list.files("tempOutputFiles",pattern=".csv")

    # BUG: basename() therefore resolves against the *working* directory,
    # not tempOutputFiles — the extracted CSVs are never actually removed.
    file.remove(basename(.files))
}

Using mclapply - behaves the same

# Same logic wrapped for mclapply(); it shares the identical race on the
# single "tempOutputFiles" directory, which is why it "behaves the same".
require(dplyr)
require(tidyr)
require(foreach)
require(doParallel)
require(magrittr)

#List all tar files in directory
# NOTE(review): unanchored regex again; "\\.tar$" would be safer.
list_of_files<-list.files(pattern = ".tar")

# BUG: `fun` is not an R keyword — this line errors as written; it should
# read `myParFun <- function(filename){`.
myParFun<-fun(filename){

print(paste(filename))

print("2. Untar all .csv files inside")
 # BUG: every parallel worker extracts into the SAME shared directory.
 untar(tarfile = filename, exdir = "tempOutputFiles")



 print("#3. Read in files and add up two columns")
# BUG: "tempOutputFiles/$.csv" is used as a path, not a pattern — it
# matches no directory and yields character(0).
df<-vroom::vroom(list.files("tempOutputFiles/$.csv"), id="path")

df$A<-df$B+df$C

    print("#4. save RDS")

saveRDS(object = df, file = paste0(tools::file_path_sans_ext(filename, compression = TRUE),".rds"))

 print("#5. Clean up files")

   # Bare file names (no full.names = TRUE) ...
   .files<-list.files("tempOutputFiles",pattern=".csv")

    # ... so file.remove() looks in the working directory, not
    # tempOutputFiles, and the extracted CSVs survive.
    file.remove(.files)
}

Based on Waldi's comment I've created a directory for each file in list_of_files and it now works fine. But is there a nicer approach? Using tempdir() for example?


Solution

  • As suggested in comments, the code below creates one directory per process / tar file, untars, merges the CSVs in a .rds file and deletes them.
    Note that it seems that vroom needs the altrep = FALSE argument to avoid a permission denied error at deletion.

    # Generate sample tars for test: two archives of two CSVs each.
    sample_csvs <- list(
      "file1.csv" = mtcars,
      "file2.csv" = mtcars,
      "file3.csv" = iris,
      "file4.csv" = iris
    )
    for (nm in names(sample_csvs)) {
      write.csv(sample_csvs[[nm]], nm)
    }
    tar("tar1.tar", files = c("file1.csv", "file2.csv"), tar = "tar")
    tar("tar2.tar", files = c("file3.csv", "file4.csv"), tar = "tar")
    
    require(dplyr)
    require(tidyr)
    require(foreach)
    require(doParallel)
    require(magrittr)
    
    # List all tar files in the working directory.
    # Anchored regex: a bare ".tar" would also match names like "data_tarX".
    list_of_files<-list.files(pattern = "\\.tar$")
    
    packsINeed<-c("vroom","magrittr","dplyr","tidyr","doParallel")
    
    # Start parallel backend
    
    myCluster<-makeCluster(2,type="PSOCK")
    registerDoParallel(myCluster) 
    
    # seq_len() is safe when no tar files exist (1:0 would iterate twice).
    # Each iteration extracts into its OWN directory, so workers can never
    # see or delete each other's CSVs — this is the fix for the race.
    foreach(i= seq_len(NROW(list_of_files)),.packages = packsINeed)%dopar%{
      print(paste(list_of_files[i], "which is", i, "of", NROW(list_of_files) ))
      
      print("2. Untar .csv files inside")
      fileout <- tools::file_path_sans_ext(list_of_files[i], compression = TRUE)
      exdir <- paste0("temp",fileout)   # per-tar directory, e.g. "temptar1"
      untar(tarfile = list_of_files[i], exdir = exdir)
      
      print("#3. Read in files and add up two columns")
      # list.files(pattern=) expects a REGEX, not a glob — "\\.csv$", not
      # "*.csv".  altrep = FALSE makes vroom materialise the data eagerly;
      # lazy altrep keeps the CSVs open and deletion then fails with
      # "permission denied" on Windows.
      df<-vroom::vroom(list.files(exdir, pattern = "\\.csv$", full.names = TRUE),
                       altrep = FALSE)
      
      # df$A<-df$B+df$C   # These columns don't exist in mtcars used as example
      
      print("#4. save RDS")
      
      saveRDS(object = df, file = file.path(exdir,paste0(fileout,".rds")))
      
      print("#5. Clean up files")
      
      .files<-list.files(exdir,pattern="\\.csv$")
      
      file.remove(file.path(exdir,.files))
    }
    
    # Release the worker processes; otherwise they linger until the R
    # session ends.
    stopCluster(myCluster)
    

    Not sure where the .rds should go, so left for the time being in the temporary folder.