Tags: hyperparameters, tidymodels

Extract hyperparameter values from workflowset in Tidymodels


I wish to plot the hyperparameter performance (RMSE and RSQ) from a workflowset in tidymodels; however, I'm struggling to piece together the syntax.

I'm trying to replicate the plot below, taken from here:

How can I extract the hyperparameters from my race results?

[image: example plot of performance metrics against hyperparameter values]

# LOAD THE PACKAGES USED BELOW
library(tidymodels)
library(AmesHousing)  # make_ames()
library(baguette)     # bag_tree()
library(bonsai)       # lightgbm ENGINE FOR boost_tree()
library(finetune)     # tune_race_anova()
library(doParallel)

# LOAD THE HOUSING DATA AND CLEAN NAMES
ames_data <- make_ames() %>%
  janitor::clean_names() %>%
  mutate(sale_price_log = log10(sale_price))

# SPLIT INTO TRAINING AND TESTING DATA, STRATIFIED BY SALE PRICE
set.seed(123)
ames_split <- rsample::initial_split(
  ames_data %>% select(-sale_price),
  prop = 0.8,
  strata = sale_price_log
)

# CREATE TRAINING AND TESTING OBJECTS FROM THE SPLIT OBJECT
ames_train <- training(ames_split)
ames_test <- testing(ames_split)

# CREATE RESAMPLES TO CHOOSE AND COMPARE MODELS
set.seed(234)
ames_folds <- vfold_cv(ames_train, strata = sale_price_log, v = 5)

# DEFINE PREPROCESSING RECIPES --------------------------------------------

base_rec <- recipe(sale_price_log ~ ., data = ames_train) %>%
  # LOG-TRANSFORM GR_LIV_AREA TO ADDRESS SKEWNESS (SALE_PRICE WAS LOGGED ABOVE)
  step_log(gr_liv_area, base = 10) %>%
  # CREATE DUMMY VARIABLES FROM FACTOR COLUMNS
  step_dummy(all_nominal_predictors(), one_hot = TRUE)

normalise_rec <- recipe(sale_price_log ~ ., data = ames_train) %>%
  # REMOVE NEAR-ZERO-VARIANCE NOMINAL PREDICTORS
  step_nzv(all_nominal_predictors()) %>%
  # POOL RARE LEVELS OF ALL NOMINAL PREDICTORS TO IMPROVE MODEL ROBUSTNESS
  step_other(all_nominal_predictors(), threshold = 0.05, other = "OTHER") %>%
  # STABILISE VARIANCE AND NORMALISE DISTRIBUTIONS OF ALL NUMERIC PREDICTORS
  step_YeoJohnson(all_numeric_predictors()) %>% 
  # NORMALIZING ALL NUMERIC PREDICTORS TO ENSURE THEY ARE ON A SIMILAR SCALE
  step_normalize(all_numeric_predictors()) %>%
  # CREATE DUMMY VARIABLES FROM FACTOR COLUMNS
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  # REMOVE ANY COLUMNS WITH A SINGLE UNIQUE VALUE
  step_zv(all_predictors())

# PCA RECIPE
pca_rec <- recipe(sale_price_log ~ ., data = ames_train) %>% 
  # FOR UNSEEN FACTOR LEVELS, CREATE A NEW LEVEL CALLED "NEW"
  step_novel(all_nominal_predictors()) %>%
  # CREATE DUMMY VARIABLES FROM FACTOR COLUMNS
  step_dummy(all_nominal_predictors()) %>%
  # REMOVE ANY COLUMNS WITH A SINGLE UNIQUE VALUE
  step_zv(all_predictors()) %>%
  # NORMALIZING ALL NUMERIC PREDICTORS TO ENSURE THEY ARE ON A SIMILAR SCALE
  step_normalize(all_numeric_predictors()) %>%
  # CONVERT PREDICTORS TO PRINCIPAL COMPONENTS RETAINING 95% OF THE VARIANCE
  step_pca(all_predictors(), threshold = 0.95)


# # CHECK HOW MANY COLUMNS EACH PREPPED RECIPE PRODUCES;
# # THESE COUNTS SET THE UPPER BOUNDS FOR MTRY BELOW
#
# # THERE ARE 309 COLUMNS IN THE BASE RECIPE
# base_rec %>%
#   prep() %>%
#   juice() %>%
#   ncol()
# 
# normalise_rec %>%
#   prep() %>%
#   juice() %>%
#   ncol()
# 
# pca_rec %>%
#   prep() %>%
#   juice() %>%
#   ncol()

# BUILD MODELS -----------------------------------------------------------

# DEFINE A BAGGED DECISION TREE MODEL
bagged_spec <- bag_tree(
  tree_depth = tune(),
  min_n = tune(),
  cost_complexity = tune()
) %>%
  set_mode("regression") %>%
  set_engine("rpart", times = 25L)

# DEFINE A RANGER RANDOM FOREST MODEL
rf_spec <-
  rand_forest(
    mtry = tune(),
    min_n = tune(),
    trees = 500
  ) %>%
  set_engine("ranger") %>%
  set_mode("regression")

# DEFINE AN XGBOOST MODEL
xgb_spec <- boost_tree(
  trees = 500,
  tree_depth = tune(),
  min_n = tune(),
  loss_reduction = tune(),
  sample_size = tune(),
  mtry = tune(),
  learn_rate = tune()
) %>%
  set_engine("xgboost", importance = TRUE) %>%
  set_mode("regression")

# DEFINE A LIGHTGBM BOOSTED TREE MODEL
bt_spec <-
  boost_tree(
    learn_rate = tune(),
    stop_iter = tune(),
    trees = 500
  ) %>%
  set_engine("lightgbm", num_leaves = tune()) %>%
  set_mode("regression")

# DEFINE A WORKFLOW SET ---------------------------------------------------

wflw_set <-
  workflow_set(
    preproc = list(base = base_rec, normalise = normalise_rec, pca = pca_rec),
    models = list(xgb = xgb_spec, bagged = bagged_spec, rf = rf_spec, bt = bt_spec),
    cross = TRUE
  )

# UPDATE MTRY PARAMETER FOR THE BASE XGB MODEL
base_xgb_param <- wflw_set %>%
  extract_workflow(
    id = "base_xgb"
  ) %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 308)))

# UPDATE MTRY PARAMETER FOR THE BASE RF MODEL
base_rf_param <- wflw_set %>% 
  extract_workflow(
    id = "base_rf"
  ) %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 308)))

# UPDATE MTRY PARAMETER FOR THE NORMALISED XGB MODEL
normalise_xgb_param <- wflw_set %>%
  extract_workflow(
    id = "normalise_xgb"
  ) %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 284)))

# UPDATE MTRY PARAMETER FOR THE NORMALISED RF MODEL
normalise_rf_param <- wflw_set %>%
  extract_workflow(
    id = "normalise_rf"
  ) %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 284)))

# UPDATE MTRY PARAMETER FOR THE PCA XGB MODEL
pca_xgb_param <- wflw_set %>%
  extract_workflow(
    id = "pca_xgb"
  ) %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 5)))

# UPDATE MTRY PARAMETER FOR THE PCA RF MODEL
pca_rf_param <- wflw_set %>%
  extract_workflow(
    id = "pca_rf"
  ) %>%
  hardhat::extract_parameter_set_dials() %>%
  update(mtry = mtry(c(1, 5)))

# UPDATE THE WORKFLOW SET WITH THE NEW PARAMETERS
wf_set_tune_list_finalize <- wflw_set %>%
  option_add(param_info = base_xgb_param, id = "base_xgb") %>%
  option_add(param_info = base_rf_param, id = "base_rf") %>%
  option_add(param_info = normalise_xgb_param, id = "normalise_xgb") %>%
  option_add(param_info = normalise_rf_param, id = "normalise_rf") %>%
  option_add(param_info = pca_xgb_param, id = "pca_xgb") %>%
  option_add(param_info = pca_rf_param, id = "pca_rf")

# SPECIFY THE RACING CONTROL OPTIONS
race_ctrl <-
  control_race(
    save_pred = TRUE,
    parallel_over = "everything",
    save_workflow = TRUE
  )

# DETECT THE NUMBER OF CORES
cores <- parallel::detectCores(logical = FALSE)

# CREATE A SET OF COPIES OF R RUNNING IN PARALLEL AND COMMUNICATING VIA SOCKETS
cl <- parallel::makePSOCKcluster(cores)

# REGISTER THE PARALLEL BACKEND
doParallel::registerDoParallel(cl)

# APPLY RACE ANOVA TUNING TO EACH WORKFLOW IN THE WORKFLOW SET
tictoc::tic()
race_results <- wf_set_tune_list_finalize %>%
  workflow_map(
    "tune_race_anova",
    seed = 123,
    resamples = ames_folds,
    grid = 5,
    control = race_ctrl,
    verbose = TRUE
  )
tictoc::toc()

# STOP THE CLUSTER ONCE TUNING IS COMPLETE
parallel::stopCluster(cl)

# EXTRACT THE BEST RESULTS
best_results <- 
  race_results %>% 
  extract_workflow_set_result("base_xgb") %>% 
  select_best(metric = "rmse")

Solution

  • When you say "extract the hyperparameters," I'm assuming you mean that you want to find the hyperparameter values associated with each of the performance metrics for the workflow set result.

    Note that, in a workflow set, each tuning result can have different tuning parameters associated with it. If we collected the tuning parameters from all of the results, with a column for each parameter, the values in those columns would be missing for any tuning result that doesn't use a given parameter; this is why we don't include those columns by default in collect_metrics() on a workflow set.
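
    For example, calling collect_metrics() directly on the workflow set returns one row per metric and configuration, identified by wflow_id and .config, but with no tuning-parameter columns:

    race_results %>%
      collect_metrics()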

    Your best_results object contains the tuning parameter values for that best combination. To find the hyperparameter values across all workflows, you could bind the output of collect_metrics() together, filling in missing values for mismatched columns. Here's an example with the built-in chi_features_res object:

    library(tidymodels)
    
    chi_features_res
    #> # A workflow set/tibble: 3 × 4
    #>   wflow_id         info             option    result   
    #>   <chr>            <list>           <list>    <list>   
    #> 1 date_lm          <tibble [1 × 4]> <opts[2]> <rsmp[+]>
    #> 2 plus_holidays_lm <tibble [1 × 4]> <opts[2]> <rsmp[+]>
    #> 3 plus_pca_lm      <tibble [1 × 4]> <opts[3]> <tune[+]>
    
    chi_features_res %>%
      # collect metrics for each result
      rowwise() %>%
      mutate(metrics = list(collect_metrics(result))) %>%
      pull(metrics) %>%
      # bind rows
      bind_rows()
    #> # A tibble: 40 × 7
    #>    .metric .estimator  mean     n std_err .config               num_comp
    #>    <chr>   <chr>      <dbl> <int>   <dbl> <chr>                    <int>
    #>  1 rmse    standard   0.733     1      NA Preprocessor1_Model1        NA
    #>  2 rsq     standard   0.982     1      NA Preprocessor1_Model1        NA
    #>  3 rmse    standard   0.646     1      NA Preprocessor1_Model1        NA
    #>  4 rsq     standard   0.986     1      NA Preprocessor1_Model1        NA
    #>  5 rmse    standard   0.609     1      NA Preprocessor01_Model1       15
    #>  6 rsq     standard   0.987     1      NA Preprocessor01_Model1       15
    #>  7 rmse    standard   0.642     1      NA Preprocessor02_Model1       18
    #>  8 rsq     standard   0.986     1      NA Preprocessor02_Model1       18
    #>  9 rmse    standard   0.586     1      NA Preprocessor03_Model1        3
    #> 10 rsq     standard   0.989     1      NA Preprocessor03_Model1        3
    #> # ℹ 30 more rows
    

    Created on 2024-01-08 with reprex v2.0.2

    You would substitute race_results for chi_features_res. :)
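
    For the plot itself, here's a minimal sketch under that substitution (the all_metrics name and the plotting choices are mine, not from tune): keep wflow_id while binding the metrics together, then pivot the tuning-parameter columns long so each parameter gets its own facet column, with one facet row per metric:

    library(tidymodels)
    
    # collect metrics per workflow, keeping the workflow id
    all_metrics <- race_results %>%
      mutate(metrics = purrr::map(result, collect_metrics)) %>%
      select(wflow_id, metrics) %>%
      tidyr::unnest(metrics)
    
    # one facet column per tuning parameter, one facet row per metric
    all_metrics %>%
      pivot_longer(
        cols = -c(wflow_id, .metric, .estimator, mean, n, std_err, .config),
        names_to = "parameter",
        values_to = "value"
      ) %>%
      filter(!is.na(value)) %>%
      ggplot(aes(value, mean, colour = wflow_id)) +
      geom_point(alpha = 0.7) +
      facet_grid(.metric ~ parameter, scales = "free") +
      labs(x = "parameter value", y = "metric value")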