rlapplybootstrapping

Why are all the first entries in my list NA?


I have a block of code that's inside a function. It returns a list of lists (i.e. the object dfTemp would be a list of lists). When I use the function, I can see that the first value of all the internal lists is NA.

dfTemp <- parLapply(cl, 1:X,
                      function(j) {
                        library(matrixStats) #matrixStats needs to be loaded within each worker node
                        result <- colMeans2(
                          mapply(
                          function(i) rowMeans2(
                            matrix(
                              sample(sample(bdf[, 1], i), B * i, replace = TRUE), B, i),
                            na.rm = TRUE),
                          N1:N2), 
                          na.rm = TRUE)
                        return(result)  
                        }
                      )

Here's the structure of dfTemp - notice the NA value for the first row in all lists:

Browse[1]> str(dfTemp)
List of 100
 $ : num [1:15] NA 0.598 0.948 0.179 0.284 ...
 $ : num [1:15] NA 0.355 0.834 0.312 0.243 ...
 $ : num [1:15] NA 0.425 0.521 0.361 0.296 ...
 $ : num [1:15] NA 0.8166 0.0939 0.155 0.351 ...
 $ : num [1:15] NA 0 0.197 0.413 0.219 ...

Another issue I'm having, which I think may be related, is that when I try to convert my list of lists to a data frame, it names the columns by concatenating all the values.

data.frame':    15 obs. of  2 variables:
 $ c.NA..0.598119527934818..0.947653884049345..0.178908501367397..: num  NA 0.598 0.948 0.179 0.284 ...
 $ c.NA..0.354694348429857..0.833605540582925..0.312442033011144..: num  NA 0.355 0.834 0.312 0.243 ...

How do I fix these two (presumed to be related) problems?

Here's a fully reproducible example:

#### LIBRARIES ####
library(parallel)
library(tidyverse)
library(matrixStats)

#### BOOTSTRAPPING FUNCTIONS ####
bootstrap_function <- function(column, N1, N2, B, X, param) {
  bdf <- as.data.frame(column)
  colnames(bdf) <- names(column)
  
  cl <- makeCluster(detectCores() - 1)  # Enable parallel processing with n-1 cores
  clusterExport(cl, c("bdf", "N1", "N2", "B", "X", "param"), envir = environment())  # Export the objects for parallel processing
  
  dfTemp <- switch(
    param,
    "mean" = {
        parLapply(cl, 1:X,
                  function(j) {
                    library(matrixStats) #matrixStats needs to be loaded within each worker node
                    result <- colMeans2(
                      mapply(
                      function(i) rowMeans2(
                        matrix(
                          sample(sample(bdf[, 1], i), B * i, replace = TRUE), B, i),
                        na.rm = TRUE),
                      N1:N2), 
                      na.rm = TRUE)
                    return(result)  
                    }
                  )
      }, 
    "sd" = {
      parLapply(cl, 1:X, 
                function(j) {
                  browser()
                  library(matrixStats) #matrixStats needs to be loaded within each worker node
                  result <- colSds(
                    mapply(
                    function(i) rowSds(
                      matrix(
                        sample(sample(bdf[, 1], i), B * i, replace = TRUE), B, i),
                      na.rm = TRUE),
                    N1:N2), 
                    na.rm = TRUE)
                  return(result)  
                  }
                )
      }
    )
      
  stopCluster(cl)

  browser() #use for debugging... 
  
  # Convert to dataframe and then name columns
  dfTemp <- as.data.frame(dfTemp)
  #colnames(dfTemp) <- paste0("est", 1:X)

  # Add identifiers
  dfTemp <- cbind("Group" = rep(names(column), nrow(dfTemp)), dfTemp)  # Add column with group identifier
  dfTemp <- cbind("n" = row.names(dfTemp), dfTemp) # Add columnw with sample size identifer
  
  return(dfTemp)
}


#### PREP THE DATA ####
df <- data.frame(
  "1.A"= c(10.7,9.7,10.7,11.9,10,10.5,9,10.9,9.6,11.8,8.7,11.9,10.7,10.4,12.7),
  "1.B"= c(11.7,10.2,10.9,11.4,10.3,9.8,9.7,10.2,10.6,8.6,9.1,9.8,13.3,9.8,8.3),
  "2.A"= c(11.6,10.6,9.9,10,11.3,10.4,11.2,8.3,9.2,11.2,11.3,11.2,11,8,9.2),
  "2.B"= c(10.7,11.5,10.1,8.9,11.5,9.5,12.1,10.7,8.2,10.2,9.6,10.4,8.3,11.1,9.4)
)


#### RUN THE ANALYSIS ####

# Replace any zero values with NA
df[df == 0] <- NA

# define lower and upper bounds for sample sizes to estimate
N1 <- 1
N2 <- nrow(df)

# set number of bootstrap replicates
B <- 100

# set number of times to repeat the estimate
X <- 100

# define which parameter to estimate - functions for "mean" and "sd" are supported
param <- "sd"


# Apply the bootstrap function to each column of the data frame
dfBoot <- as.data.frame(
  do.call(
    rbind, lapply(seq_along(df), function(x) {
      browser() # used to enter break mode inside lapply function call for debugging
      colname <- names(df)[x]
      print(paste("Processing column:", colname))
      bootstrap_function(column = df[x], N1 = N1, N2 = N2, B = B, X = X, param = param)
    })
    )
  )

Solution

  • Well I solved my own problem. I converted the code to use apply and then added browser statements inside each internal function so I could see what was going on.

    Long story short, the issue was with trying to calculate a standard deviation off s sample size of 1. Sample lower bound needs to be at least 2. Stupid mistake on my end. Simple fix:

    # define lower and upper bounds for sample sizes to estimate
    N1 <- 2 # needs to be at least 2 
    N2 <- nrow(df)