Here is my code:
# Keep two cores free for the rest of the system, but never request fewer than
# one worker: detectCores() can return NA or a value <= 2, which would make
# plan() fail.
plan(multisession, workers = max(1L, detectCores() - 2L, na.rm = TRUE))

# Run summaryModel() on the first two models in parallel and row-bind the
# results; the name of each list element is stored in the "VARIABLE" column.
future_map_dfr(
  .x = Liste_model[1:2],
  .f = summaryModel,
  df = DF_MODEL_TRAIN,
  df_test = DF_MODEL_TEST,
  df_global = DF_MODEL_GLOBAL,
  .id = "VARIABLE",
  .progress = TRUE
)
I get the following error:
Error in (function (.x, .f, ..., .progress = FALSE) :
i In index: 1.
i With name: MODEL_PROPAL_1.
Caused by error in `vec_cbind()`:
! Can't recycle `..1` (size 3) to match `..3` (size 2).
However, when I run it with plan(sequential), i.e. without parallelization, my code runs correctly and returns the expected result.
Here is the function I am using; the problem occurs in the last statement of the function, at the "Output" step:
#' Summarise one fitted model: p-value check and Gini indexes
#'
#' Builds a lookup table of the model's beta coefficients, flags the
#' coefficients whose p-value is >= 0.05, scores three data sets with the
#' betas and computes a Gini index (2 * AUC - 1) on each of them.
#'
#' @param df Training data. Must contain a `RATING` column; it is joined to
#'   the per-driver beta tables on their common columns.
#' @param model Fitted model object exposing `beta`, `xlevels` and `formula`,
#'   and for which `coef(summary(model))` has a `Pr(>|z|)` column.
#' @param df_test Test data, same layout as `df`.
#' @param df_global Global data, same layout as `df`.
#' @return A data frame with the columns `Train`, `Test`, `Global` (Gini
#'   indexes), `Model` (the formula as a single string), `PVALUE_CHECK`
#'   ("OK" or "KO : ..."), and a nested `data` column holding the filtered
#'   coefficient table.
summaryModel <- function(df, model, df_test, df_global) {
  # Debug trace: position of this model inside the global `Liste_model`.
  # NOTE(review): relies on a global being available where the function runs.
  print(
    match(
      as.character(model %>% pull(formula)),
      as.character(Liste_model %>% map(function(x) {
        x %>% pull(formula)
      }))
    )
  )

  # Collection of beta parameters: one column per driver holding the modality,
  # plus the `beta` column moved to the last position.
  df_beta <- data.frame(beta = model$beta) %>%
    rownames_to_column("label") %>%
    mutate(
      driver = str_extract(label, "^(.*?)MERGED|^(.*?)SHIPYARD|^(.*?)ACTIF"),
      modality = sub(".*MERGED", "", label)
    ) %>%
    pivot_wider(names_from = driver, values_from = modality) %>%
    select(-label) %>%
    relocate(beta, .after = last_col())

  # Regex alternation of the driver names, used to keep only the coefficient
  # rows that belong to a driver.
  liste_variable <- df_beta %>%
    select(-beta) %>%
    colnames()
  liste_variable <- paste0(liste_variable, collapse = "|")

  check_pvalue <- as.data.frame(coef(summary(model))) %>%
    rownames_to_column(var = "VARIABLE") %>%
    filter(str_detect(VARIABLE, liste_variable))

  # Flag the modalities whose p-value is not significant at the 5% level.
  if (any(check_pvalue$`Pr(>|z|)` >= 0.05)) {
    MODALITE_KO <- check_pvalue %>%
      filter(`Pr(>|z|)` >= 0.05) %>%
      pull(VARIABLE)
    MODALITE_KO <- paste(MODALITE_KO, collapse = "&")
    check_pvalue2 <- data.frame(PVALUE_CHECK = paste0("KO : ", MODALITE_KO))
  } else {
    check_pvalue2 <- data.frame(PVALUE_CHECK = "OK")
  }

  # Calculation of score: one small table per driver (driver column + its
  # beta, renamed beta_<i>). seq_along() avoids the c(1, 0) trap of
  # 1:length() when the model has no factor levels.
  df_beta_split <- lapply(
    seq_along(model$xlevels),
    function(x) {
      df_beta %>%
        select(all_of(x), last_col()) %>%
        filter(if_all(everything(), ~ !is.na(.))) %>%
        rename_with(~ str_c(., x, sep = "_"), last_col())
    }
  )

  # Joins every per-driver beta table onto `dataframe`, sums the betas into a
  # score and returns the Gini index of that score against RATING.
  Calcul_gini <- function(dataframe) {
    df_score <- c(list(dataframe), df_beta_split) %>%
      reduce(left_join) %>%
      mutate(score = rowSums(across(starts_with("beta")), na.rm = TRUE))
    # Calculation of Gini indexes
    df_score %>%
      summarise(
        gini_nor = as.numeric(
          2 * auc(multiclass.roc(RATING, score, quiet = TRUE)) - 1
        )
      )
  }

  liste_df <- list(df, df_test, df_global)
  liste_df2 <- liste_df %>% map_dfr(~ Calcul_gini(.))
  liste_df2$DATA <- c("Train", "Test", "Global")
  liste_df2 <- liste_df2 %>%
    pivot_wider(names_from = DATA, values_from = gini_nor)

  # Build the formula label as ONE string without relying on S3 dispatch.
  # as.character() on a bare formula returns three elements ("~", lhs, rhs)
  # unless some attached package supplies its own method, which is presumably
  # why the result differed between the main session and the parallel workers
  # and triggered the "Can't recycle" error in bind_cols().
  model_formula <- model$formula
  model_label <- if (is.character(model_formula)) {
    model_formula
  } else {
    paste(deparse(model_formula, width.cutoff = 500L), collapse = " ")
  }

  # Output: returned visibly (the last expression is no longer an assignment).
  liste_df2 %>%
    tidytable::bind_cols("Model" = model_label) %>%
    tidytable::bind_cols(check_pvalue2, check_pvalue) %>%
    nest(data = VARIABLE:`Pr(>|z|)`)
}
I think the issue is related to the packages available to the different workers, but I'm not sure. Any help would be appreciated. Thanks a lot!
I finally found the problem, and it was indeed linked to the package environment.
For those facing the same problem, the solution below works: it tells all the subprocesses to use the packages loaded in the current environment:
# Same call as before, but every namespace loaded in the main session is
# passed to the workers through furrr_options(packages = ...), presumably so
# that functions and S3 methods resolve as they do under plan(sequential).
# NOTE(review): loadedNamespaces() is broad; listing only the packages that
# summaryModel() needs would likely be enough - confirm.
future_map_dfr(
  .x = Liste_model[1:2],
  .f = summaryModel,
  df = DF_MODEL_TRAIN,
  df_test = DF_MODEL_TEST,
  df_global = DF_MODEL_GLOBAL,
  .id = "VARIABLE",
  .options = furrr_options(packages = loadedNamespaces())
)