rtidyversepurrrr-futurefurrr

Function works with plan(sequential) but not plan(multisession)


Here is my code :

plan(multisession,workers=detectCores()-2)
future_map_dfr(.x= Liste_model[1:2],.f = summaryModel, df = DF_MODEL_TRAIN, df_test = DF_MODEL_TEST, df_global = DF_MODEL_GLOBAL, .id = "VARIABLE",.progress = T)

I got the following error :

Error in (function (.x, .f, ..., .progress = FALSE)  : 
  i In index: 1.
i With name: MODEL_PROPAL_1.
Caused by error in `vec_cbind()`:
! Can't recycle `..1` (size 3) to match `..3` (size 2).

But when I ran with plan(sequential), so without parallelization, my code runs correctly and I have the result.

Here is the function I am using, the problem concerns the last line of the function, at the "Output" step :

summaryModel <- function(df, model, df_test, df_global) {
 
  print(match(as.character(model %>% pull(formula)),as.character(Liste_model %>% map(function(x){x %>% pull(formula)}))))
  # Collection of beta parameters
  df_beta <- data.frame(beta = model$beta) %>%
    rownames_to_column("label") %>%
    mutate(
      driver = str_extract(label, "^(.*?)MERGED|^(.*?)SHIPYARD|^(.*?)ACTIF"),
      modality = sub(".*MERGED", "", label)
    ) %>%
    pivot_wider(names_from = driver, values_from = modality) %>%
    select(-label) %>%
    relocate(beta, .after = last_col())

  liste_variable <- df_beta %>%
    select(-beta) %>%
    colnames()
  liste_variable <- paste0(liste_variable, collapse = "|")

  check_pvalue <- as.data.frame(coef(summary(model))) %>%
    rownames_to_column(var = "VARIABLE") %>%
    filter(
      str_detect(VARIABLE, liste_variable) == T
    )

  if (any(check_pvalue$`Pr(>|z|)` >= 0.05)) {
    MODALITE_KO <- check_pvalue %>%
      filter(`Pr(>|z|)` >= 0.05) %>%
      pull(VARIABLE)
    MODALITE_KO <- paste(MODALITE_KO, collapse = "&")
    check_pvalue2 <- data.frame(PVALUE_CHECK = paste0("KO : ", MODALITE_KO))
  } else {
    check_pvalue2 <- data.frame(PVALUE_CHECK = "OK")
  }

  # Calculation of score
  df_beta_split <- lapply(
    1:length(model$xlevels),
    function(x) {
      df_beta %>%
        select(all_of(x), last_col()) %>%
        filter(if_all(everything(), ~ !is.na(.))) %>%
        rename_with(~ str_c(., x, sep = "_"), last_col())
    }
  )

  Calcul_gini <- function(dataframe) {
    df_score <- c(list(dataframe), df_beta_split) %>%
      reduce(left_join) %>%
      mutate(score = rowSums(across(starts_with("beta")), na.rm = T))

    # Calculation of Gini indexes
    df_score %>%
      summarise(
        gini_nor = as.numeric(2 * auc(multiclass.roc(RATING, score, quiet = T)) - 1)
      )
  }

  liste_df <- list(df, df_test, df_global)
  liste_df2 <- liste_df %>% map_dfr(~ Calcul_gini(.))
  liste_df2$DATA <- c("Train", "Test", "Global")
  liste_df2 <- liste_df2 %>% pivot_wider(names_from = DATA, values_from = gini_nor)


  # Output
  liste_df2=liste_df2 %>%
    tidytable::bind_cols("Model" = as.character(model$formula)) %>%
    tidytable::bind_cols(check_pvalue2, check_pvalue) %>%
    nest(data = VARIABLE:`Pr(>|z|)`)
  
}

I think that the issue concerns a problem of package used by the different workers, but I'm not sure of it. Any help ? thanks a lot


Solution

  • Finally found the problem, and it was indeed linked to the packages environment.

    For those facing the same problem, the solution below works, and tell to all the subprocesses to use the packages loaded in the current environmnent :

    future_map_dfr(.x= Liste_model[1:2],.f = summaryModel, df = DF_MODEL_TRAIN, df_test = DF_MODEL_TEST, df_global = DF_MODEL_GLOBAL, .id = "VARIABLE",.options = furrr_options(packages = loadedNamespaces()))