rparallel-processingopenxlsx

openxlsx and writeData in parallel


I'm trying to parallelise with openxlsx and its function writeData the export of many Excel individual files that should be somehow summarised in a central Excel file. As can be shown in the reprex below I first create the central Excel file before creating the parallel processes and then I create the individual files in their respective parallel clusters. The problem is that nothing (i.e. no 'summary', here simply the Row counter) is written in the central Excel file, although I exported its associated workbook object to the parallel processes.

lapply(c("openxlsx", "parallel"), library, character.only = TRUE)

NameOutputFolder <- "Output"
NameOutputSubfolder <- "Individual files"

OutputFolder <- file.path(".", NameOutputFolder)
if(!dir.exists(OutputFolder))   dir.create(OutputFolder)
OutputSubfolder <- file.path(".", NameOutputFolder, NameOutputSubfolder)
if(!dir.exists(OutputSubfolder))    dir.create(OutputSubfolder)

OutputFile_Central <- file.path(OutputFolder, "Excel_Central.xlsx")

Workbook_Central <- createWorkbook()
addWorksheet(wb = Workbook_Central, sheetName = "Summary", zoom = 80, gridLines = FALSE)

no_cores <- detectCores()
print(paste0("Configuring parallelisation (", no_cores, " cores found) and setting up clusters"))
MyCluster <- makePSOCKcluster(no_cores - 1)
clusterEvalQ(MyCluster, {
    library(openxlsx)
})
clusterExport(MyCluster, c("OutputSubfolder", "Workbook_Central"))
parLapply(cl = MyCluster, X = 1:10, fun = function(Row){
    OutputFile_Individual <- file.path(OutputSubfolder, paste0("Excel_Individual_", Row, ".xlsx"))
    Workbook_Individual <- createWorkbook()
    writeData(wb = Workbook_Central, sheet = "Summary", x = Row, startCol = 1, startRow = Row)
    saveWorkbook(wb = Workbook_Individual, file = OutputFile_Individual, overwrite = TRUE)
})
stopCluster(MyCluster)
saveWorkbook(wb = Workbook_Central, file = OutputFile_Central, overwrite = TRUE)

I guess the solution would be to create temporary central Excel files in each parallel process (indexed by Sys.getpid()) and then to merge them in the unique central Excel file after the parallel code has run, right? No other solution?


Solution

  • With clusterExport(MyCluster, c("OutputSubfolder", "Workbook_Central")) you are copying master's OutputSubfolder & Workbook_Central objects to node processes, all changes are just local. And not just local to the function environment, those object live in separate R processes.

    You might want to return summary data (Row in this example) from parLapplyed function and handle Workbook_Central in your master process when working with a parLapply() result. This would also align better with common lapply() usage pattern -- stuff goes in, transformed stuff with the same length comes out, no (or minimal) side effects from applied function.

    lapply(c("openxlsx", "parallel"), library, character.only = TRUE)
    
    NameOutputFolder <- "Output"
    NameOutputSubfolder <- "Individual files"
    
    OutputFolder <- file.path(".", NameOutputFolder)
    if(!dir.exists(OutputFolder))   dir.create(OutputFolder)
    OutputSubfolder <- file.path(".", NameOutputFolder, NameOutputSubfolder)
    if(!dir.exists(OutputSubfolder))    dir.create(OutputSubfolder)
    
    OutputFile_Central <- file.path(OutputFolder, "Excel_Central.xlsx")
    
    no_cores <- detectCores()
    print(paste0("Configuring parallelisation (", no_cores, " cores found) and setting up clusters"))
    #> [1] "Configuring parallelisation (8 cores found) and setting up clusters"
    MyCluster <- makePSOCKcluster(no_cores - 1)
    clusterEvalQ(MyCluster, {
      library(openxlsx)
    })
    clusterExport(MyCluster, c("OutputSubfolder"))
    
    # collect summary data into resulting_list
    resulting_list <- parLapply(cl = MyCluster, X = 1:10, fun = function(Row){
      OutputFile_Individual <- file.path(OutputSubfolder, paste0("Excel_Individual_", Row, ".xlsx"))
      Workbook_Individual <- createWorkbook()
      saveWorkbook(wb = Workbook_Individual, file = OutputFile_Individual, overwrite = TRUE)
      # return summary (a scalar, vector, list, data.frame, ...):
      Row
    })
    stopCluster(MyCluster)
    
    # structure of resulting_list
    str(resulting_list)
    #> List of 10
    #>  $ : int 1
    #>  $ : int 2
    #>  $ : int 3
    #>  $ : int 4
    #>  $ : int 5
    #>  $ : int 6
    #>  $ : int 7
    #>  $ : int 8
    #>  $ : int 9
    #>  $ : int 10
    
    # transform it for writeData(), writing a list would store it at A1:J1 
    (summary_data <- unlist(resulting_list))
    #>  [1]  1  2  3  4  5  6  7  8  9 10
    
    # create & save Workbook_Central
    Workbook_Central <- createWorkbook()
    addWorksheet(wb = Workbook_Central, sheetName = "Summary", zoom = 80, gridLines = FALSE)
    writeData(wb = Workbook_Central, sheet = "Summary", x = summary_data, startCol = 1, startRow = 1)
    saveWorkbook(wb = Workbook_Central, file = OutputFile_Central, overwrite = TRUE)
    

    Resulting files:

    fs::dir_tree(OutputFolder)
    #> ./Output
    #> ├── Excel_Central.xlsx
    #> └── Individual files
    #>     ├── Excel_Individual_1.xlsx
    #>     ├── Excel_Individual_10.xlsx
    #>     ├── Excel_Individual_2.xlsx
    #>     ├── Excel_Individual_3.xlsx
    #>     ├── Excel_Individual_4.xlsx
    #>     ├── Excel_Individual_5.xlsx
    #>     ├── Excel_Individual_6.xlsx
    #>     ├── Excel_Individual_7.xlsx
    #>     ├── Excel_Individual_8.xlsx
    #>     └── Excel_Individual_9.xlsx