I wish to plot the hyper-parameter performance (RMSE and RSQ) from a workflowset in Tidymodels however I'm struggling to piece together the syntax.
I'm trying to replicate the plot below, taken from here:
How can I extract the hyperparameters from my race results?
# load the housing data and clean names
ames_data <- make_ames() %>%
janitor::clean_names() %>%
mutate(sale_price_log = log10(sale_price))
# SPLIT INTO TRAINING AND TESTING DATA. STRATIFY BY SALE PRICE
ames_split <- rsample::initial_split(
ames_data %>% select(-sale_price),
prop = 0.8,
strata = sale_price_log
)
# CREATE TRAINING AND TESTING OBJECTS FROM THE SPLIT OBJECT
ames_train <- training(ames_split)
ames_test <- testing(ames_split)
# CREATE RESAMPLES TO CHOOSE AND COMPARE MODELS
set.seed(234)
ames_folds <- vfold_cv(ames_train, strata = sale_price_log, v = 5)
# DEFINE PREPROCESSING RECIPES --------------------------------------------
base_rec <- recipe(sale_price_log ~ ., data = ames_train) %>%
# APPLYING LOG TRANSFORMATION TO SALE_PRICE AND GR_LIV_AREA TO ADDRESS SKEWNESS
step_log(gr_liv_area, base = 10) %>%
# CREATE DUMMY VARIABLES FROM FACTOR COLUMNS
step_dummy(all_nominal_predictors(), one_hot = TRUE)
normalise_rec <- recipe(sale_price_log ~ ., data = ames_train) %>%
# REMOVE ANY COLUMNS WITH A SINGLE UNIQUE VALUE
step_nzv(all_nominal_predictors()) %>%
# HANDLING RARE FACTOR LEVELS IN NEIGHBORHOOD TO IMPROVE MODEL ROBUSTNESS
step_other(all_nominal_predictors(), threshold = 0.05, other = "OTHER") %>%
# STABILIZING VARIANCE AND NORMALIZING DISTRIBUTIONS FOR LOT_AREA AND GR_LIV_AREA
step_YeoJohnson(all_numeric_predictors()) %>%
# NORMALIZING ALL NUMERIC PREDICTORS TO ENSURE THEY ARE ON A SIMILAR SCALE
step_normalize(all_numeric_predictors()) %>%
# CREATE DUMMY VARIABLES FROM FACTOR COLUMNS
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
# REMOVE ANY COLUMNS WITH A SINGLE UNIQUE VALUE
step_zv(all_predictors())
# PCA RECIPE
pca_rec <- recipe(sale_price_log ~ ., data = ames_train) %>%
# FOR UNSEEN FACTROR LEVELS, CREATE A NEW LEVEL CALLED "NEW"
step_novel(all_nominal_predictors()) %>%
# CREATE DUMMY VARIABLES FROM FACTOR COLUMNS
step_dummy(all_nominal_predictors()) %>%
# REMOVE ANY COLUMNS WITH A SINGLE UNIQUE VALUE
step_zv(all_predictors()) %>%
# NORMALIZING ALL NUMERIC PREDICTORS TO ENSURE THEY ARE ON A SIMILAR SCALE
step_normalize(all_numeric_predictors()) %>%
# CONVERT NUMERIC COLUMNS TO PRINCIPAL COMPONENTS
step_pca(all_predictors(), threshold = 0.95)
# # THERE ARE 309 COLUMNS IN THE BASE RECIPE
# base_rec %>%
# prep() %>%
# juice() %>%
# ncol()
#
# z <- normalise_rec %>%
# prep() %>%
# juice() %>%
# ncol()
#
# pca_rec %>%
# prep() %>%
# juice() %>%
# ncol()
#
# # THERE ARE 297 COLUMNS IN THE NORMALISED RECIPE
# pca_rec %>%
# prep() %>%
# juice() %>%
# ncol()
# BUILD MODELS -----------------------------------------------------------
# DEFINE A BAGGED RANDOM FOREST MODEL
bagged_spec <- bag_tree(
tree_depth = tune(),
min_n = tune(),
cost_complexity = tune()
) %>%
set_mode("regression") %>%
set_engine("rpart", times = 25L)
# DEFINE A RANGER RANDOM FOREST MODEL
rf_spec <-
rand_forest(
mtry = tune(),
min_n = tune(),
trees = 500
) %>%
set_engine("ranger") %>%
set_mode("regression")
# DEFINE AN XGBOOST MODEL
xgb_spec <- boost_tree(
trees = 500,
tree_depth = tune(),
min_n = tune(),
loss_reduction = tune(),
sample_size = tune(),
mtry = tune(),
learn_rate = tune()
) %>%
set_engine("xgboost", importance = TRUE) %>%
set_mode("regression")
# DEFINE A BOOSTED TREE ENSEMBLE MODEL
bt_spec <-
boost_tree(
learn_rate = tune(),
stop_iter = tune(),
trees = 500
) %>%
set_engine("lightgbm", num_leaves = tune()) %>%
set_mode("regression")
# DEFINE A WORKFLOW SET ---------------------------------------------------
wflw_set <-
workflow_set(
preproc = list(base = base_rec, normalise = normalise_rec, pca = pca_rec),
models = list(xgb = xgb_spec, bagged = bagged_spec, rf = rf_spec, bt = bt_spec),
cross = TRUE
)
# UPDATE MTRY PARAMETER FOR THE BASE XGBOOST
base_xgb_param <- wflw_set %>%
extract_workflow(
id = "base_xgb"
) %>%
hardhat::extract_parameter_set_dials() %>%
update(mtry = mtry(c(1, 308)))
base_rf_param <- wflw_set %>%
extract_workflow(
id = "base_rf"
) %>%
hardhat::extract_parameter_set_dials() %>%
update(mtry = mtry(c(1, 308)))
# UPDATE MTRY PARAMETER FOR THE NORMALISED XGB MODEL
normalise_xgb_param <- wflw_set %>%
extract_workflow(
id = "normalise_xgb"
) %>%
hardhat::extract_parameter_set_dials() %>%
update(mtry = mtry(c(1, 284)))
# UPDATE MTRY PARAMETER FOR THE NORMALISED RF MODEL
normalise_rf_param <- wflw_set %>%
extract_workflow(
id = "normalise_rf"
) %>%
hardhat::extract_parameter_set_dials() %>%
update(mtry = mtry(c(1, 284)))
# UPDATE MTRY PARAMETER FOR THE PCA XGB MODEL
pca_xgb_param <- wflw_set %>%
extract_workflow(
id = "pca_xgb"
) %>%
hardhat::extract_parameter_set_dials() %>%
update(mtry = mtry(c(1, 5)))
# UPDATE MTRY PARAMETER FOR THE PCA XGB MODEL
pca_rf_param <- wflw_set %>%
extract_workflow(
id = "pca_rf"
) %>%
hardhat::extract_parameter_set_dials() %>%
update(mtry = mtry(c(1, 5)))
# UPDATE THE WORKFLOW SET WITH THE NEW PARAMETERS
wf_set_tune_list_finalize <- wflw_set %>%
option_add(param_info = base_xgb_param, id = "base_xgb") %>%
option_add(param_info = base_rf_param, id = "base_rf") %>%
option_add(param_info = normalise_xgb_param, id = "normalise_xgb") %>%
option_add(param_info = normalise_rf_param, id = "normalise_rf") %>%
option_add(param_info = pca_xgb_param, id = "pca_xgb") %>%
option_add(param_info = pca_rf_param, id = "pca_rf")
# SPECIFY THE TUNE GRID
race_ctrl <-
control_race(
save_pred = TRUE,
parallel_over = "everything",
save_workflow = TRUE
)
# DETECT THE NUMBER OF CORES
cores <- parallel::detectCores(logical = FALSE)
# CREATE A SET OF COPIES OF R RUNNING IN PARALLEL AND COMMUNICATING VIA SOCKETS
cl <- makePSOCKcluster(cores)
# REGISTER THE PARALLEL BACKEND
doParallel::registerDoParallel(cores = cl)
# APPLY RACE ANOVA TUNING TO EACH WORKFLOW IN THE WORKFLOW SET
tictoc::tic()
race_results <- wf_set_tune_list_finalize %>%
workflow_map(
"tune_race_anova",
seed = 123,
resamples = ames_folds,
grid = 5,
control = race_ctrl,
verbose = TRUE
)
tictoc::toc()
# EXTRACT THE BEST RESULTS
best_results <-
race_results %>%
extract_workflow_set_result("base_xgb") %>%
select_best(metric = "rmse")
When you say "extract the hyperparameters," I'm assuming you mean that you want to find the hyperparameter values associated with each of the performance metrics for the workflow set result.
Note that, in a workflow set, each of the different tuning results could have different tuning parameters associated with it, so if we want to collect the tuning parameters from all of the results, with a column for each tuning parameter, the values in those columns would be missing for any tuning result that doesn't make use of a given tuning parameter; this is why we don't include those columns by default in collect_metrics()
on a workflow set.
Your best_results
object contains the tuning parameter values for that best combination. To find the hyperparameter values for all objects, you could bind the output of collect_metrics()
together, filling in missing values for mismatched columns. Here's an example with the built-in chi_features_res
object:
library(tidymodels)
chi_features_res
#> # A workflow set/tibble: 3 × 4
#> wflow_id info option result
#> <chr> <list> <list> <list>
#> 1 date_lm <tibble [1 × 4]> <opts[2]> <rsmp[+]>
#> 2 plus_holidays_lm <tibble [1 × 4]> <opts[2]> <rsmp[+]>
#> 3 plus_pca_lm <tibble [1 × 4]> <opts[3]> <tune[+]>
chi_features_res %>%
# collect metrics for each result
rowwise() %>%
mutate(metrics = list(collect_metrics(result))) %>%
pull(metrics) %>%
# bind rows
bind_rows()
#> # A tibble: 40 × 7
#> .metric .estimator mean n std_err .config num_comp
#> <chr> <chr> <dbl> <int> <dbl> <chr> <int>
#> 1 rmse standard 0.733 1 NA Preprocessor1_Model1 NA
#> 2 rsq standard 0.982 1 NA Preprocessor1_Model1 NA
#> 3 rmse standard 0.646 1 NA Preprocessor1_Model1 NA
#> 4 rsq standard 0.986 1 NA Preprocessor1_Model1 NA
#> 5 rmse standard 0.609 1 NA Preprocessor01_Model1 15
#> 6 rsq standard 0.987 1 NA Preprocessor01_Model1 15
#> 7 rmse standard 0.642 1 NA Preprocessor02_Model1 18
#> 8 rsq standard 0.986 1 NA Preprocessor02_Model1 18
#> 9 rmse standard 0.586 1 NA Preprocessor03_Model1 3
#> 10 rsq standard 0.989 1 NA Preprocessor03_Model1 3
#> # ℹ 30 more rows
Created on 2024-01-08 with reprex v2.0.2
You would substitute race_results
for chi_features_res
. :)