Tidymodels parallelization using custom recipe step on Windows 11 machine

I've prepared a custom recipe step that works when parameter tuning is run sequentially, but fails when attempting to run in parallel.

I've tried doParallel PSOCK, doFuture cluster, and doFuture multisession specifications, and each one gives a similar error:

show_notes(.Last.tune.result)
#> unique notes:
#> ────────────────────────────────────────────────────────────────────────────────
#> Error in `recipes::prep()`:
#> ! You cannot `prep()` a tuneable recipe. Argument(s) with `tune()`: 'n_features'. Do you want to use a tuning function such as `tune_grid()`?

I suspect this may be related to the parallel backends on a Windows machine not receiving global variables/objects (described here and here). I'm not certain this is what is happening here, and if it is, I don't know what exactly is missing from the backend (e.g., variables, other model objects, functions defined in the custom step function, the custom step function itself).

I'm looking for help on understanding what's going on here and how to control these things when trying to tune in parallel when using a custom recipe step.

If it's the functions that are missing, is there a way to pass them directly to the backends, or do I need to put them in a package to call in the backend options?

Here's a reproducible example.

Create the step function:

# HOUSEKEEPING ####
# Removes every object, including hidden (dot-prefixed) ones, from the global
# environment. It does not detach packages or reset options.
rm(list = ls(all = TRUE)) # clean house
# CRAN libraries
# Attach order matters here: foreach (pulled in by doParallel) masks
# purrr::accumulate() and purrr::when(), as the messages below report.
library(tidyverse) # install.packages("tidyverse")
library(tidymodels) # install.packages("tidymodels")
library(finetune) # install.packages("finetune")
#> Warning: package 'finetune' was built under R version 4.3.2
library(doParallel) # install.packages("doParallel")
#> Warning: package 'doParallel' was built under R version 4.3.2
#> Loading required package: foreach
#> 
#> Attaching package: 'foreach'
#> The following objects are masked from 'package:purrr':
#> 
#>     accumulate, when
#> Loading required package: iterators
#> Loading required package: parallel
library(reprex) # install.packages("reprex")
#> Warning: package 'reprex' was built under R version 4.3.2
# Bioconductor libraries
library(limma) #  BiocManager::install("limma")
# set conflict preference
tidymodels_prefer()


# DEFINE step_limma ####
# Add a limma-based feature-selection step to a recipe.
#
# Arguments:
#   recipe            recipe to append the step to.
#   ...               selectors for the candidate feature columns.
#   role, trained, skip, id
#                     stored on the step object unchanged.
#   include_covariate if TRUE, prep.step_limma() adds the covariate column to
#                     the limma design matrix.
#   covariate_id      name of that covariate column.
#   remove_corr       if TRUE, prep.step_limma() runs caret::findCorrelation()
#                     on the top-ranked features.
#   corr_threshold    cutoff handed to caret::findCorrelation().
#   n_features        number of top-ranked features prep.step_limma() works
#                     with.
#   removals          column names to drop; filled in by prep.step_limma().
#
# Returns the recipe with the new, untrained step appended.
step_limma <- function(
    recipe,
    ...,
    role = NA,
    trained = FALSE,
    include_covariate = FALSE,
    covariate_id = NULL,
    remove_corr = FALSE,
    corr_threshold = NULL,
    n_features = NULL,
    removals = NULL,
    skip = FALSE,
    id = rand_id("limma")) {
    # Capture the selectors as quosures so they are resolved at prep() time.
    selectors <- enquos(...)

    limma_step <- step_limma_new(
        terms = selectors,
        role = role,
        trained = trained,
        include_covariate = include_covariate,
        covariate_id = covariate_id,
        remove_corr = remove_corr,
        corr_threshold = corr_threshold,
        n_features = n_features,
        removals = removals,
        skip = skip,
        id = id
    )

    add_step(recipe, limma_step)
}

# Low-level constructor: bundles the supplied fields into a step object with
# subclass "limma". It does no validation of its own; every argument is
# forwarded unchanged to step().
step_limma_new <- function(
    terms, role, trained, include_covariate, covariate_id,
    remove_corr, corr_threshold, n_features, removals, skip, id) {
    limma_step <- step(
        subclass = "limma",
        terms = terms, role = role, trained = trained,
        include_covariate = include_covariate, covariate_id = covariate_id,
        remove_corr = remove_corr, corr_threshold = corr_threshold,
        n_features = n_features, removals = removals,
        skip = skip, id = id
    )
    limma_step
}

# Train the limma feature-selection step.
#
# Fits a limma linear model of every selected column on the outcome
# (optionally adjusting for one covariate), ranks the columns by p-value for
# the outcome term (design column 2) and records which columns bake() must
# drop.
#
# Arguments:
#   x         untrained step_limma object.
#   training  training data holding the selected columns, the outcome and,
#             when used, the covariate.
#   info      variable/role table; the row whose role is "outcome" names the
#             outcome column.
#   ...       unused.
#
# Returns a trained step_limma whose `removals` lists the columns to drop:
# the covariate (if any), every feature ranked below the top `n_features`,
# and, when `remove_corr` is TRUE, the top-ranked features that
# caret::findCorrelation() flags at `corr_threshold`.
#
# Aborts when `n_features` is not a single non-negative number, when there is
# not exactly one outcome, when the covariate settings are inconsistent, or
# when `remove_corr` is TRUE without a single `corr_threshold`.
prep.step_limma <- function(x, training, info = NULL, ...) {
    col_names <- recipes_eval_select(x$terms, training, info)
    check_type(training[, col_names], types = c("double", "integer"))

    covariate_id <- x$covariate_id
    n_features <- x$n_features
    if (length(n_features) != 1 || !is.numeric(n_features) ||
        is.na(n_features) || n_features < 0) {
        rlang::abort("n_features must be a single non-negative number")
    }

    # %in% (rather than ==) so rows with a missing role are simply skipped.
    target <- info$variable[info$role %in% "outcome"]
    if (length(target) != 1) {
        rlang::abort("step_limma requires exactly one outcome variable")
    }
    tar <- training[[target]]

    # limma expects features in rows and samples in columns.
    feature_cols <- setdiff(unname(col_names), covariate_id)
    df_limma <- t(as.matrix(training[, feature_cols]))

    if (x$include_covariate) {
        if (length(covariate_id) == 0) {
            rlang::abort("include_covariate == TRUE but covariate_id was not specified\nspecify covariate_id")
        }
        if (length(covariate_id) > 1) {
            rlang::abort("more than one covariate_id specified\nstep_limma only supports 1 covariate at present")
        }
        covar <- training[[covariate_id]]
        design <- model.matrix(~ tar + covar)
    } else {
        design <- model.matrix(~tar)
    }

    fit <- limma::lmFit(df_limma, design)
    ebayes <- limma::eBayes(fit)
    # number = Inf returns every feature; arguments are spelled out in full
    # instead of relying on partial matching.
    tab <- limma::topTable(
        ebayes,
        coef = 2,
        number = Inf,
        adjust.method = "fdr",
        sort.by = "p"
    )

    # Feature names ordered from most to least significant.
    ranked <- rownames(tab)
    # seq_len() + min() keep this safe when n_features >= number of features
    # (a `(n + 1):nrow(tab)` range would count backwards and drop a feature).
    n_keep <- min(floor(n_features), length(ranked))
    keep <- ranked[seq_len(n_keep)]

    if (x$remove_corr) {
        if (length(x$corr_threshold) != 1) {
            rlang::abort("remove_corr == TRUE but corr_threshold was not specified\nspecify a single corr_threshold")
        }
        # A correlation filter needs at least two candidate features.
        if (length(keep) > 1) {
            corr_mat <- cor(
                as.matrix(training[, keep]),
                method = "spearman",
                use = "pairwise.complete.obs"
            )
            # Indices (within `keep`) of the features flagged as redundant.
            cor_features <- caret::findCorrelation(
                corr_mat,
                cutoff = x$corr_threshold
            )
            # Guard the empty case: x[-integer(0)] would drop everything.
            if (length(cor_features) > 0) {
                keep <- keep[-cor_features]
            }
        }
    }

    # Everything that was ranked but not kept is removed, plus the covariate.
    removals <- c(covariate_id, setdiff(ranked, keep))

    step_limma_new(
        terms = x$terms,
        trained = TRUE,
        role = x$role,
        include_covariate = x$include_covariate,
        covariate_id = x$covariate_id,
        remove_corr = x$remove_corr,
        corr_threshold = x$corr_threshold,
        n_features = x$n_features,
        removals = removals,
        skip = x$skip,
        id = x$id
    )
}

# Apply the trained step: hand `new_data` and the step object to
# recipes_remove_cols() and return its result visibly (the previous trailing
# assignment returned it invisibly).
bake.step_limma <- function(object, new_data, ...) {
    recipes_remove_cols(new_data, object)
}

# Print a one-line summary of the step.
#
# Hands print_step() the recorded `removals` as the trained object and the
# original selectors (`terms`) as the untrained object. `n_features` is a
# single number without names, so names(x$n_features) was always NULL and
# carried no information.
# Returns `x` invisibly.
print.step_limma <- function(x, width = max(20, options()$width - 35), ...) {
    title <- "limma feature filter on "
    print_step(x$removals, x$terms, x$trained, title, width)
    invisible(x)
}

# Summarise the step as a tibble with columns `features` and `id`.
#
# Trained step: `features` lists the columns recorded in `removals`.
# Untrained step: `features` holds the selector expressions from `terms`.
# The step object has no `features` field, so the values are read from
# `removals` / `terms`; the output column name is kept for compatibility.
tidy.step_limma <- function(x, ...) {
    if (is_trained(x)) {
        features <- as.character(unname(x$removals))
    } else {
        var_expr <- map(x$terms, quo_get_expr)
        features <- unname(
            map_chr(var_expr, quo_text, width = options()$width, nlines = 1)
        )
    }
    # rep() keeps the id column the same length as `features`, including 0.
    tibble(features = features, id = rep(x$id, length(features)))
}

# Tuning-parameter definition for `n_features`: an integer parameter whose
# bounds (`range`) are both inclusive. `trans` is passed through unchanged
# and no finalize function is attached.
num_features <- function(range = c(10L, 1000L), trans = NULL) {
    param_label <- c(n_features = "# features")
    new_quant_param(
        type = "integer",
        range = range, inclusive = c(TRUE, TRUE),
        trans = trans,
        label = param_label,
        finalize = NULL
    )
}

# Tuning-parameter definition for `include_covariate`: a logical parameter
# whose candidate values default to TRUE and FALSE.
# The stray trailing comma after `label` was removed so that no empty
# argument is passed on to new_qual_param().
include_covariate <- function(values = c(TRUE, FALSE)) {
    new_qual_param(
        type = "logical",
        values = values,
        label = c(include_covariate = "covariate")
    )
}

# Tuning-parameter definition for `corr_threshold`: a double parameter whose
# bounds (`range`) are both inclusive. `trans` is passed through unchanged
# and no finalize function is attached.
corr_threshold <- function(range = c(0.7, 0.9), trans = NULL) {
    param_label <- c(corr_threshold = "correlation threshold")
    new_quant_param(
        type = "double",
        range = range, inclusive = c(TRUE, TRUE),
        trans = trans,
        label = param_label,
        finalize = NULL
    )
}

# Tuning-parameter definition for `remove_corr`: a logical parameter whose
# candidate values default to TRUE and FALSE.
# The stray trailing comma after `label` was removed so that no empty
# argument is passed on to new_qual_param().
remove_corr <- function(values = c(TRUE, FALSE)) {
    new_qual_param(
        type = "logical",
        values = values,
        label = c(remove_corr = "remove correlated features")
    )
}

# Declare which step_limma arguments can be tuned.
#
# Returns a four-row tibble: one row per tunable argument, each paired with
# the name of the function that builds its parameter object. `source`,
# `component` and `component_id` are the same for every row.
tunable.step_limma <- function(x, ...) {
    # argument name -> name of its parameter-definition function
    param_funs <- c(
        n_features = "num_features",
        include_covariate = "include_covariate",
        remove_corr = "remove_corr",
        corr_threshold = "corr_threshold"
    )
    tibble::tibble(
        name = names(param_funs),
        call_info = lapply(unname(param_funs), function(f) list(fun = f)),
        source = "recipe",
        component = "step_limma",
        component_id = x$id
    )
}

# Packages this step needs at run time.
# "dials" is listed because the parameter functions named in
# tunable.step_limma() are built with new_quant_param()/new_qual_param().
required_pkgs.step_limma <- function(x, ...) {
    c("limma", "caret", "yardstick", "dials")
}

Test it sequentially and in parallel:

# DEFINE SEED ####
# Single seed reused before every random operation below.
seed <- 42


# DEFINE SET ####
set.seed(seed)
# Simulated data: 1000 rows, 100 numeric predictors and a random two-level
# factor outcome `target` placed in the first column.
# NOTE(review): runif() draws only 1000 values for a 1000 x 100 matrix, so
# matrix() recycles them and all 100 predictor columns are identical. Draw
# 1000 * 100 values if independent predictors are intended.
set <- matrix(runif(1000, min = 1, max = 10), nrow = 1000, ncol = 100) %>%
    data.frame() %>%
    tibble() %>%
    mutate(
        target = sample(c(0, 1), 1000, replace = TRUE) %>% factor(),
        .before = 1
    )


# DEFINE TRAINING AND VALIDATION SETS ####
set.seed(seed)
# Split stratified on the outcome.
set_split <- initial_split(set, strata = target)
set_train <- training(set_split)
set_test <- testing(set_split)


# SET UP RESAMPLING ####
set.seed(seed)
# 5-fold cross-validation on the training portion, stratified on the outcome.
cv_folds <- set_train %>% vfold_cv(v = 5, strata = target)


# BUILD MODELS ####
# Linear SVM (kernlab engine) with both hyperparameters marked for tuning.
mod_svmlinear <- svm_linear(cost = tune(), margin = tune()) %>%
    set_engine("kernlab") %>%
    set_mode("classification")


# BUILD RECIPE ####
# Only n_features is marked for tuning; the other step_limma arguments keep
# their defaults.
# NOTE(review): num_features() defaults to the range 10-1000 while this data
# has 100 predictors, so candidate values can exceed the number of features.
recipe_set <- recipe(target ~ ., data = set_train) %>%
    step_limma(
        all_predictors(),
        n_features = tune()
    )


# CREATE WORKFLOWS ####
workflow_spec <- workflow() %>%
    add_recipe(recipe_set) %>%
    add_model(mod_svmlinear)


# CHECK IF step_limma WORKS IF TUNING IS DONE SEQUENTIALLY ####
set.seed(seed)
# Racing (win/loss) tuning over 5 candidate parameter sets, scored by ROC AUC.
# allow_par = FALSE keeps this run sequential.
res_tune_sequential <- workflow_spec %>%
    tune_race_win_loss(
        resamples = cv_folds,
        metrics = metric_set(roc_auc),
        grid = 5,
        control = control_race(
            save_pred = TRUE,
            event_level = "second",
            save_workflow = TRUE,
            allow_par = FALSE
        )
    )
res_tune_sequential %>% collect_metrics()
#> # A tibble: 5 × 9
#>       cost  margin n_features .metric .estimator  mean     n std_err .config    
#>      <dbl>   <dbl>      <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>      
#> 1 11.1     0.153          419 roc_auc binary     0.509     5 0.0113  Preprocess…
#> 2  0.0120  0.00566        948 roc_auc binary     0.481     5 0.00746 Preprocess…
#> 3  1.63    0.0461         374 roc_auc binary     0.504     5 0.0120  Preprocess…
#> 4  0.00497 0.0859         133 roc_auc binary     0.507     5 0.0116  Preprocess…
#> 5  0.0836  0.194          687 roc_auc binary     0.493     5 0.0116  Preprocess…


# CHECK IF step_limma WORKS IF TUNING IS DONE IN PARALLEL ####
# Start 5 PSOCK worker processes and register them as the foreach backend.
# The step_limma methods and parameter functions defined above live only in
# this session's global environment; nothing in this block copies them to the
# workers.
cl <- parallel::makePSOCKcluster(5)
doParallel::registerDoParallel(cl)
set.seed(seed)

# Same tuning call as the sequential run, except allow_par = TRUE.
res_tune <- workflow_spec %>%
    tune_race_win_loss(
        resamples = cv_folds,
        metrics = metric_set(roc_auc),
        grid = 5,
        control = control_race(
            save_pred = TRUE,
            event_level = "second",
            save_workflow = TRUE,
            allow_par = TRUE
        )
    )
#> Warning: All models failed. Run `show_notes(.Last.tune.result)` for more
#> information.
#> Error in `dplyr::filter()`:
#> ℹ In argument: `.metric == metric`.
#> Caused by error:
#> ! object '.metric' not found
#> Backtrace:
#>      ▆
#>   1. ├─workflow_spec %>% ...
#>   2. ├─finetune::tune_race_win_loss(...)
#>   3. ├─finetune:::tune_race_win_loss.workflow(...)
#>   4. │ └─finetune:::tune_race_win_loss_workflow(...)
#>   5. │   └─finetune:::test_parameters_bt(res, control$alpha)
#>   6. │     └─tune::collect_metrics(x, summarize = FALSE) %>% ...
#>   7. ├─dplyr::filter(., .metric == metric)
#>   8. ├─dplyr:::filter.data.frame(., .metric == metric)
#>   9. │ └─dplyr:::filter_rows(.data, dots, by)
#>  10. │   └─dplyr:::filter_eval(...)
#>  11. │     ├─base::withCallingHandlers(...)
#>  12. │     └─mask$eval_all_filter(dots, env_filter)
#>  13. │       └─dplyr (local) eval()
#>  14. └─base::.handleSimpleError(...)
#>  15.   └─dplyr (local) h(simpleError(msg, call))
#>  16.     └─rlang::abort(message, class = error_class, parent = parent, call = error_call)

# Shut down the worker processes started by makePSOCKcluster() above.
parallel::stopCluster(cl)

Session info:

sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.3.1 (2023-06-16 ucrt)
#>  os       Windows 11 x64 (build 22621)
#>  system   x86_64, mingw32
#>  ui       RTerm
#>  language (EN)
#>  collate  English_Canada.utf8
#>  ctype    English_Canada.utf8
#>  tz       America/Edmonton
#>  date     2023-11-28
#>  pandoc   3.1.6.1 @ C:/PROGRA~1/Pandoc/ (via rmarkdown)

Solution

Thanks @topepo for the suggestion to use some parallel controls to pass the functions to the backends. Perhaps not the most elegant solution but it seems to be working.

Here's a reproduction of the entire process

Create the step function:

# HOUSEKEEPING ####
# Removes every object, including hidden (dot-prefixed) ones, from the global
# environment. It does not detach packages or reset options.
rm(list = ls(all = TRUE)) # clean house
# CRAN libraries
# Attach order matters here: foreach (pulled in by doParallel) masks
# purrr::accumulate() and purrr::when(), as the messages below report.
library(tidyverse) # install.packages("tidyverse")
library(tidymodels) # install.packages("tidymodels")
library(finetune) # install.packages("finetune")
#> Warning: package 'finetune' was built under R version 4.3.2
library(doParallel) # install.packages("doParallel")
#> Warning: package 'doParallel' was built under R version 4.3.2
#> Loading required package: foreach
#> 
#> Attaching package: 'foreach'
#> The following objects are masked from 'package:purrr':
#> 
#>     accumulate, when
#> Loading required package: iterators
#> Loading required package: parallel
library(bench) # install.packages("bench")
#> Warning: package 'bench' was built under R version 4.3.2
library(reprex) # install.packages("reprex")
#> Warning: package 'reprex' was built under R version 4.3.2
# Bioconductor libraries
library(limma) #  BiocManager::install("limma")


# DEFINE step_limma ####
# Add a limma-based feature-selection step to a recipe.
#
# Arguments:
#   recipe            recipe to append the step to.
#   ...               selectors for the candidate feature columns.
#   role, trained, skip, id
#                     stored on the step object unchanged.
#   include_covariate if TRUE, prep.step_limma() adds the covariate column to
#                     the limma design matrix.
#   covariate_id      name of that covariate column.
#   remove_corr       if TRUE, prep.step_limma() runs caret::findCorrelation()
#                     on the top-ranked features.
#   corr_threshold    cutoff handed to caret::findCorrelation().
#   n_features        number of top-ranked features prep.step_limma() works
#                     with.
#   removals          column names to drop; filled in by prep.step_limma().
#
# Returns the recipe with the new, untrained step appended.
step_limma <- function(
    recipe,
    ...,
    role = NA,
    trained = FALSE,
    include_covariate = FALSE,
    covariate_id = NULL,
    remove_corr = FALSE,
    corr_threshold = NULL,
    n_features = NULL,
    removals = NULL,
    skip = FALSE,
    id = rand_id("limma")) {
    # Capture the selectors as quosures so they are resolved at prep() time.
    selectors <- enquos(...)

    limma_step <- step_limma_new(
        terms = selectors,
        role = role,
        trained = trained,
        include_covariate = include_covariate,
        covariate_id = covariate_id,
        remove_corr = remove_corr,
        corr_threshold = corr_threshold,
        n_features = n_features,
        removals = removals,
        skip = skip,
        id = id
    )

    add_step(recipe, limma_step)
}

# Low-level constructor: bundles the supplied fields into a step object with
# subclass "limma". It does no validation of its own; every argument is
# forwarded unchanged to step().
step_limma_new <- function(
    terms, role, trained, include_covariate, covariate_id,
    remove_corr, corr_threshold, n_features, removals, skip, id) {
    limma_step <- step(
        subclass = "limma",
        terms = terms, role = role, trained = trained,
        include_covariate = include_covariate, covariate_id = covariate_id,
        remove_corr = remove_corr, corr_threshold = corr_threshold,
        n_features = n_features, removals = removals,
        skip = skip, id = id
    )
    limma_step
}

# Train the limma feature-selection step.
#
# Fits a limma linear model of every selected column on the outcome
# (optionally adjusting for one covariate), ranks the columns by p-value for
# the outcome term (design column 2) and records which columns bake() must
# drop.
#
# Arguments:
#   x         untrained step_limma object.
#   training  training data holding the selected columns, the outcome and,
#             when used, the covariate.
#   info      variable/role table; the row whose role is "outcome" names the
#             outcome column.
#   ...       unused.
#
# Returns a trained step_limma whose `removals` lists the columns to drop:
# the covariate (if any), every feature ranked below the top `n_features`,
# and, when `remove_corr` is TRUE, the top-ranked features that
# caret::findCorrelation() flags at `corr_threshold`.
#
# Aborts when `n_features` is not a single non-negative number, when there is
# not exactly one outcome, when the covariate settings are inconsistent, or
# when `remove_corr` is TRUE without a single `corr_threshold`.
prep.step_limma <- function(x, training, info = NULL, ...) {
    col_names <- recipes_eval_select(x$terms, training, info)
    check_type(training[, col_names], types = c("double", "integer"))

    covariate_id <- x$covariate_id
    n_features <- x$n_features
    if (length(n_features) != 1 || !is.numeric(n_features) ||
        is.na(n_features) || n_features < 0) {
        rlang::abort("n_features must be a single non-negative number")
    }

    # %in% (rather than ==) so rows with a missing role are simply skipped.
    target <- info$variable[info$role %in% "outcome"]
    if (length(target) != 1) {
        rlang::abort("step_limma requires exactly one outcome variable")
    }
    tar <- training[[target]]

    # limma expects features in rows and samples in columns.
    feature_cols <- setdiff(unname(col_names), covariate_id)
    df_limma <- t(as.matrix(training[, feature_cols]))

    if (x$include_covariate) {
        if (length(covariate_id) == 0) {
            rlang::abort("include_covariate == TRUE but covariate_id was not specified\nspecify covariate_id")
        }
        if (length(covariate_id) > 1) {
            rlang::abort("more than one covariate_id specified\nstep_limma only supports 1 covariate at present")
        }
        covar <- training[[covariate_id]]
        design <- model.matrix(~ tar + covar)
    } else {
        design <- model.matrix(~tar)
    }

    fit <- limma::lmFit(df_limma, design)
    ebayes <- limma::eBayes(fit)
    # number = Inf returns every feature; arguments are spelled out in full
    # instead of relying on partial matching.
    tab <- limma::topTable(
        ebayes,
        coef = 2,
        number = Inf,
        adjust.method = "fdr",
        sort.by = "p"
    )

    # Feature names ordered from most to least significant.
    ranked <- rownames(tab)
    # seq_len() + min() keep this safe when n_features >= number of features
    # (a `(n + 1):nrow(tab)` range would count backwards and drop a feature).
    n_keep <- min(floor(n_features), length(ranked))
    keep <- ranked[seq_len(n_keep)]

    if (x$remove_corr) {
        if (length(x$corr_threshold) != 1) {
            rlang::abort("remove_corr == TRUE but corr_threshold was not specified\nspecify a single corr_threshold")
        }
        # A correlation filter needs at least two candidate features.
        if (length(keep) > 1) {
            corr_mat <- cor(
                as.matrix(training[, keep]),
                method = "spearman",
                use = "pairwise.complete.obs"
            )
            # Indices (within `keep`) of the features flagged as redundant.
            cor_features <- caret::findCorrelation(
                corr_mat,
                cutoff = x$corr_threshold
            )
            # Guard the empty case: x[-integer(0)] would drop everything.
            if (length(cor_features) > 0) {
                keep <- keep[-cor_features]
            }
        }
    }

    # Everything that was ranked but not kept is removed, plus the covariate.
    removals <- c(covariate_id, setdiff(ranked, keep))

    step_limma_new(
        terms = x$terms,
        trained = TRUE,
        role = x$role,
        include_covariate = x$include_covariate,
        covariate_id = x$covariate_id,
        remove_corr = x$remove_corr,
        corr_threshold = x$corr_threshold,
        n_features = x$n_features,
        removals = removals,
        skip = x$skip,
        id = x$id
    )
}

# Apply the trained step: hand `new_data` and the step object to
# recipes_remove_cols() and return its result visibly (the previous trailing
# assignment returned it invisibly).
bake.step_limma <- function(object, new_data, ...) {
    recipes_remove_cols(new_data, object)
}

# Print a one-line summary of the step.
#
# Hands print_step() the recorded `removals` as the trained object and the
# original selectors (`terms`) as the untrained object. `n_features` is a
# single number without names, so names(x$n_features) was always NULL and
# carried no information.
# Returns `x` invisibly.
print.step_limma <- function(x, width = max(20, options()$width - 35), ...) {
    title <- "limma feature filter on "
    print_step(x$removals, x$terms, x$trained, title, width)
    invisible(x)
}

# Summarise the step as a tibble with columns `features` and `id`.
#
# Trained step: `features` lists the columns recorded in `removals`.
# Untrained step: `features` holds the selector expressions from `terms`.
# The step object has no `features` field, so the values are read from
# `removals` / `terms`; the output column name is kept for compatibility.
tidy.step_limma <- function(x, ...) {
    if (is_trained(x)) {
        features <- as.character(unname(x$removals))
    } else {
        var_expr <- map(x$terms, quo_get_expr)
        features <- unname(
            map_chr(var_expr, quo_text, width = options()$width, nlines = 1)
        )
    }
    # rep() keeps the id column the same length as `features`, including 0.
    tibble(features = features, id = rep(x$id, length(features)))
}

# Tuning-parameter definition for `n_features`: an integer parameter whose
# bounds (`range`) are both inclusive. `trans` is passed through unchanged
# and no finalize function is attached.
num_features <- function(range = c(10L, 1000L), trans = NULL) {
    param_label <- c(n_features = "# features")
    new_quant_param(
        type = "integer",
        range = range, inclusive = c(TRUE, TRUE),
        trans = trans,
        label = param_label,
        finalize = NULL
    )
}

# Tuning-parameter definition for `include_covariate`: a logical parameter
# whose candidate values default to TRUE and FALSE.
# The stray trailing comma after `label` was removed so that no empty
# argument is passed on to new_qual_param().
include_covariate <- function(values = c(TRUE, FALSE)) {
    new_qual_param(
        type = "logical",
        values = values,
        label = c(include_covariate = "covariate")
    )
}

# Tuning-parameter definition for `corr_threshold`: a double parameter whose
# bounds (`range`) are both inclusive. `trans` is passed through unchanged
# and no finalize function is attached.
corr_threshold <- function(range = c(0.7, 0.9), trans = NULL) {
    param_label <- c(corr_threshold = "correlation threshold")
    new_quant_param(
        type = "double",
        range = range, inclusive = c(TRUE, TRUE),
        trans = trans,
        label = param_label,
        finalize = NULL
    )
}

# Tuning-parameter definition for `remove_corr`: a logical parameter whose
# candidate values default to TRUE and FALSE.
# The stray trailing comma after `label` was removed so that no empty
# argument is passed on to new_qual_param().
remove_corr <- function(values = c(TRUE, FALSE)) {
    new_qual_param(
        type = "logical",
        values = values,
        label = c(remove_corr = "remove correlated features")
    )
}

# Declare which step_limma arguments can be tuned.
#
# Returns a four-row tibble: one row per tunable argument, each paired with
# the name of the function that builds its parameter object. `source`,
# `component` and `component_id` are the same for every row.
tunable.step_limma <- function(x, ...) {
    # argument name -> name of its parameter-definition function
    param_funs <- c(
        n_features = "num_features",
        include_covariate = "include_covariate",
        remove_corr = "remove_corr",
        corr_threshold = "corr_threshold"
    )
    tibble::tibble(
        name = names(param_funs),
        call_info = lapply(unname(param_funs), function(f) list(fun = f)),
        source = "recipe",
        component = "step_limma",
        component_id = x$id
    )
}

# Packages this step needs at run time: the modelling packages first, then
# the package that supplies the tuning-parameter constructors.
required_pkgs.step_limma <- function(x, ...) {
    modelling_pkgs <- c("limma", "caret", "yardstick")
    tuning_pkgs <- "dials"
    c(modelling_pkgs, tuning_pkgs)
}

Define the data and workflow:

# DEFINE SEED ####
# Single seed reused before every random operation below.
seed <- 42


# DEFINE SET ####
set.seed(seed)
# Simulated data: 1000 rows, 100 numeric predictors and a random two-level
# factor outcome `target` placed in the first column.
# NOTE(review): runif() draws only 1000 values for a 1000 x 100 matrix, so
# matrix() recycles them and all 100 predictor columns are identical. Draw
# 1000 * 100 values if independent predictors are intended.
set <- matrix(runif(1000, min = 1, max = 10), nrow = 1000, ncol = 100) %>%
    data.frame() %>%
    tibble() %>%
    mutate(
        target = sample(c(0, 1), 1000, replace = TRUE) %>% factor(),
        .before = 1
    )


# DEFINE TRAINING AND VALIDATION SETS ####
set.seed(seed)
# Split stratified on the outcome.
set_split <- initial_split(set, strata = target)
set_train <- training(set_split)
set_test <- testing(set_split)


# SET UP RESAMPLING ####
set.seed(seed)
# 5-fold cross-validation on the training portion, stratified on the outcome.
cv_folds <- set_train %>% vfold_cv(v = 5, strata = target)


# BUILD MODELS ####
# Linear SVM (kernlab engine) with both hyperparameters marked for tuning.
mod_svmlinear <- svm_linear(cost = tune(), margin = tune()) %>%
    set_engine("kernlab") %>%
    set_mode("classification")


# BUILD RECIPE ####
# Only n_features is marked for tuning; the other step_limma arguments keep
# their defaults.
# NOTE(review): num_features() defaults to the range 10-1000 while this data
# has 100 predictors, so candidate values can exceed the number of features.
recipe_set <- recipe(target ~ ., data = set_train) %>%
    step_limma(
        all_predictors(),
        n_features = tune()
    )


# CREATE WORKFLOWS ####
workflow_spec <- workflow() %>%
    add_recipe(recipe_set) %>%
    add_model(mod_svmlinear)

Set up workers and give them what they need


# doParallel TUNING  ####
# Start 20 PSOCK worker processes.
# NOTE(review): the worker count is hard-coded; confirm it suits the machine.
cl <- parallel::makeCluster(20, type = "PSOCK")
# Copy to every worker: each global object whose name matches ".step_limma"
# (the S3 methods), the low-level constructor, and the four
# parameter-definition functions.
# NOTE(review): the leading "*" in the pattern has nothing to repeat;
# "\\.step_limma$" looks like the intended regular expression -- confirm.
parallel::clusterExport(
    cl,
    c(
        ls(pattern = "*\\.step_limma"),
        "step_limma_new",
        "num_features",
        "remove_corr",
        "include_covariate",
        "corr_threshold"
    )
)

Look at what we've assigned to the workers

# Evaluate the expression below on every worker and return one result per
# worker.
# NOTE(review): only the ls() part reflects what a worker actually holds; the
# five quoted names are literal strings and appear in the output whether or
# not the objects exist on the worker.
parallel::clusterEvalQ(
    cl,
    c(
        ls(pattern = "*\\.step_limma"),
        "step_limma_new",
        "num_features",
        "remove_corr",
        "include_covariate",
        "corr_threshold"
    )
)
#> [[1]]
#>  [1] "bake.step_limma"          "prep.step_limma"         
#>  [3] "print.step_limma"         "required_pkgs.step_limma"
#>  [5] "tidy.step_limma"          "tunable.step_limma"      
#>  [7] "step_limma_new"           "num_features"            
#>  [9] "remove_corr"              "include_covariate"       
#> [11] "corr_threshold"          
#> 
#> [[2]]
#>  [1] "bake.step_limma"          "prep.step_limma"         
#>  [3] "print.step_limma"         "required_pkgs.step_limma"
#>  [5] "tidy.step_limma"          "tunable.step_limma"      
#>  [7] "step_limma_new"           "num_features"            
#>  [9] "remove_corr"              "include_covariate"       
#> [11] "corr_threshold"          
#> 
#> [[3]]
#>  [1] "bake.step_limma"          "prep.step_limma"         
#>  [3] "print.step_limma"         "required_pkgs.step_limma"
#>  [5] "tidy.step_limma"          "tunable.step_limma"      
#>  [7] "step_limma_new"           "num_features"            
#>  [9] "remove_corr"              "include_covariate"       
#> [11] "corr_threshold"          
#> 
#> [[4]]
#>  [1] "bake.step_limma"          "prep.step_limma"         
#>  [3] "print.step_limma"         "required_pkgs.step_limma"
#>  [5] "tidy.step_limma"          "tunable.step_limma"      
#>  [7] "step_limma_new"           "num_features"            
#>  [9] "remove_corr"              "include_covariate"       
#> [11] "corr_threshold"          
#> 
#> [[5]]
#>  [1] "bake.step_limma"          "prep.step_limma"         
#>  [3] "print.step_limma"         "required_pkgs.step_limma"
#>  [5] "tidy.step_limma"          "tunable.step_limma"      
#>  [7] "step_limma_new"           "num_features"            
#>  [9] "remove_corr"              "include_covariate"       
#> [11] "corr_threshold"          
#> 
#> [[6]]
#>  [1] "bake.step_limma"          "prep.step_limma"         
#>  [3] "print.step_limma"         "required_pkgs.step_limma"
#>  [5] "tidy.step_limma"          "tunable.step_limma"      
#>  [7] "step_limma_new"           "num_features"            
#>  [9] "remove_corr"              "include_covariate"       
#> [11] "corr_threshold"          
#> 
#> [[7]]
#>  [1] "bake.step_limma"          "prep.step_limma"         
#>  [3] "print.step_limma"         "required_pkgs.step_limma"
#>  [5] "tidy.step_limma"          "tunable.step_limma"      
#>  [7] "step_limma_new"           "num_features"            
#>  [9] "remove_corr"              "include_covariate"       
#> [11] "corr_threshold"          
#> 
#> [[8]]
#>  [1] "bake.step_limma"          "prep.step_limma"         
#>  [3] "print.step_limma"         "required_pkgs.step_limma"
#>  [5] "tidy.step_limma"          "tunable.step_limma"      
#>  [7] "step_limma_new"           "num_features"            
#>  [9] "remove_corr"              "include_covariate"       
#> [11] "corr_threshold"          
#> 
#> [[9]]
#>  [1] "bake.step_limma"          "prep.step_limma"         
#>  [3] "print.step_limma"         "required_pkgs.step_limma"
#>  [5] "tidy.step_limma"          "tunable.step_limma"      
#>  [7] "step_limma_new"           "num_features"            
#>  [9] "remove_corr"              "include_covariate"       
#> [11] "corr_threshold"          
#> 
#> [[10]]
#>  [1] "bake.step_limma"          "prep.step_limma"         
#>  [3] "print.step_limma"         "required_pkgs.step_limma"
#>  [5] "tidy.step_limma"          "tunable.step_limma"      
#>  [7] "step_limma_new"           "num_features"            
#>  [9] "remove_corr"              "include_covariate"       
#> [11] "corr_threshold"          
#> 
#> [[11]]
#>  [1] "bake.step_limma"          "prep.step_limma"         
#>  [3] "print.step_limma"         "required_pkgs.step_limma"
#>  [5] "tidy.step_limma"          "tunable.step_limma"      
#>  [7] "step_limma_new"           "num_features"            
#>  [9] "remove_corr"              "include_covariate"       
#> [11] "corr_threshold"          
#> 
#> [[12]]
#>  [1] "bake.step_limma"          "prep.step_limma"         
#>  [3] "print.step_limma"         "required_pkgs.step_limma"
#>  [5] "tidy.step_limma"          "tunable.step_limma"      
#>  [7] "step_limma_new"           "num_features"            
#>  [9] "remove_corr"              "include_covariate"       
#> [11] "corr_threshold"          
#> 
#> [[13]]
#>  [1] "bake.step_limma"          "prep.step_limma"         
#>  [3] "print.step_limma"         "required_pkgs.step_limma"
#>  [5] "tidy.step_limma"          "tunable.step_limma"      
#>  [7] "step_limma_new"           "num_features"            
#>  [9] "remove_corr"              "include_covariate"       
#> [11] "corr_threshold"          
#> 
#> [[14]]
#>  [1] "bake.step_limma"          "prep.step_limma"         
#>  [3] "print.step_limma"         "required_pkgs.step_limma"
#>  [5] "tidy.step_limma"          "tunable.step_limma"      
#>  [7] "step_limma_new"           "num_features"            
#>  [9] "remove_corr"              "include_covariate"       
#> [11] "corr_threshold"          
#> 
#> [[15]]
#>  [1] "bake.step_limma"          "prep.step_limma"         
#>  [3] "print.step_limma"         "required_pkgs.step_limma"
#>  [5] "tidy.step_limma"          "tunable.step_limma"      
#>  [7] "step_limma_new"           "num_features"            
#>  [9] "remove_corr"              "include_covariate"       
#> [11] "corr_threshold"          
#> 
#> [[16]]
#>  [1] "bake.step_limma"          "prep.step_limma"         
#>  [3] "print.step_limma"         "required_pkgs.step_limma"
#>  [5] "tidy.step_limma"          "tunable.step_limma"      
#>  [7] "step_limma_new"           "num_features"            
#>  [9] "remove_corr"              "include_covariate"       
#> [11] "corr_threshold"          
#> 
#> [[17]]
#>  [1] "bake.step_limma"          "prep.step_limma"         
#>  [3] "print.step_limma"         "required_pkgs.step_limma"
#>  [5] "tidy.step_limma"          "tunable.step_limma"      
#>  [7] "step_limma_new"           "num_features"            
#>  [9] "remove_corr"              "include_covariate"       
#> [11] "corr_threshold"          
#> 
#> [[18]]
#>  [1] "bake.step_limma"          "prep.step_limma"         
#>  [3] "print.step_limma"         "required_pkgs.step_limma"
#>  [5] "tidy.step_limma"          "tunable.step_limma"      
#>  [7] "step_limma_new"           "num_features"            
#>  [9] "remove_corr"              "include_covariate"       
#> [11] "corr_threshold"          
#> 
#> [[19]]
#>  [1] "bake.step_limma"          "prep.step_limma"         
#>  [3] "print.step_limma"         "required_pkgs.step_limma"
#>  [5] "tidy.step_limma"          "tunable.step_limma"      
#>  [7] "step_limma_new"           "num_features"            
#>  [9] "remove_corr"              "include_covariate"       
#> [11] "corr_threshold"          
#> 
#> [[20]]
#>  [1] "bake.step_limma"          "prep.step_limma"         
#>  [3] "print.step_limma"         "required_pkgs.step_limma"
#>  [5] "tidy.step_limma"          "tunable.step_limma"      
#>  [7] "step_limma_new"           "num_features"            
#>  [9] "remove_corr"              "include_covariate"       
#> [11] "corr_threshold"

Measure performance of parameter tuning sequentially, over resamples, or over everything

# Register the prepared cluster as the foreach backend.
doParallel::registerDoParallel(cl)

set.seed(seed)
# Time the same racing-tuning call three ways, 10 iterations each:
#   sequential            - allow_par = FALSE
#   doParallel_resamples  - parallel_over = "resamples"
#   doParallel_everything - parallel_over = "everything"
# check = FALSE: the three expressions are not required to return equal
# results. filter_gc = FALSE: iterations that triggered garbage collection
# are kept in the summary.
# NOTE(review): backend_options is used here to ask for "dials" on the
# workers; confirm this usage against the tune documentation.
res_mark <- bench::mark(
    sequential = workflow_spec %>%
        tune_race_win_loss(
            resamples = cv_folds,
            metrics = yardstick::metric_set(roc_auc),
            grid = 10,
            control = control_race(
                save_pred = TRUE,
                event_level = "second",
                save_workflow = TRUE,
                allow_par = FALSE
            )
        ),
    doParallel_resamples = workflow_spec %>%
        tune_race_win_loss(
            resamples = cv_folds,
            metrics = yardstick::metric_set(roc_auc),
            grid = 10,
            control = control_race(
                save_pred = TRUE,
                event_level = "second",
                save_workflow = TRUE,
                allow_par = TRUE,
                parallel_over = "resamples",
                backend_options = tune::new_backend_options(
                    tune_race_win_loss = list(pkgs = "dials"),
                    class = "tune_race_win_loss"
                )
            )
        ),
    doParallel_everything = workflow_spec %>%
        tune_race_win_loss(
            resamples = cv_folds,
            metrics = yardstick::metric_set(roc_auc),
            grid = 10,
            control = control_race(
                save_pred = TRUE,
                event_level = "second",
                save_workflow = TRUE,
                allow_par = TRUE,
                parallel_over = "everything",
                backend_options = tune::new_backend_options(
                    tune_race_win_loss = list(pkgs = "dials"),
                    class = "tune_race_win_loss"
                )
            )
        ),
    iterations = 10,
    check = FALSE,
    filter_gc = FALSE
)

# Shut down the worker processes once benchmarking is finished.
parallel::stopCluster(cl)



# COMPARE RESULTS ####
# Convert the expression column to plain text so the rows can be sorted by
# name, then keep the first nine columns of the benchmark table.
res_mark %>%
    mutate(expression = expression %>% as.character()) %>%
    arrange(expression) %>%
    dplyr::select(1:9)
#> # A tibble: 3 × 6
#>   expression                 min   median `itr/sec` mem_alloc `gc/sec`
#>   <chr>                 <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 doParallel_everything    4.68s    7.02s    0.145    23.79MB    0.203
#> 2 doParallel_resamples    10.63s   12.33s    0.0772   19.74MB    0.108
#> 3 sequential              16.51s   18.52s    0.0534    2.44GB    0.619

Created on 2023-11-28 with reprex v2.0.2

Session info

sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.3.1 (2023-06-16 ucrt)
#>  os       Windows 11 x64 (build 22621)
#>  system   x86_64, mingw32
#>  ui       RTerm
#>  language (EN)
#>  collate  English_Canada.utf8
#>  ctype    English_Canada.utf8
#>  tz       America/Edmonton
#>  date     2023-11-28
#>  pandoc   3.1.6.1 @ C:/PROGRA~1/Pandoc/ (via rmarkdown)