I want to compute the FIRM importance scores for a model made from a tidymodels workflow. For regex, I will use the iris dataset and try to predict whether an observation is setosa or not.
library(tidymodels)
library(readr)
library(vip)
#clean data
iris <- iris %>%
mutate(class = case_when(Species == 'setosa' ~ 'setosa',
TRUE ~ 'other'))
iris$class = as.factor(iris$class)
iris <- subset(iris, select = -c(Species))
#split data into training and testing
iris_split = initial_split(iris, prop = 0.8)
cv_splits = vfold_cv(training(iris_split), v = 5)
#preprocessing
iris_recipe = recipe(class ~., data = iris) %>%
step_center(Sepal.Length) %>%
prep()
#specify MARS model
model = rand_forest(
mode = "classification",
mtry = tune(),
trees = 50
) %>%
set_engine("ranger", importance = "impurity")
#tuning parameters
tuning_grid = grid_regular(mtry(range=c(1,4)), levels = 4)
iris_wkfl = workflow() %>%
add_recipe(iris_recipe) %>%
add_model(model)
iris_tune = tune_grid(iris_wkfl,
resamples = cv_splits,
grid = tuning_grid,
metrics = metric_set(accuracy))
best_params = iris_tune %>%
select_best(metric = "accuracy")
best_model = finalize_workflow(iris_wkfl, best_params) %>%
parsnip::fit(data = training(iris_split)) %>%
pull_workflow_fit()
vip(best_model, method = "firm")
The last line produces an error from the pdp package.
Error in get_training_data.default(object) :
The training data could not be extracted from object. Please supply the raw training data using the train
argument in the call to partial
.
Is the following line correct? Or do I need to supply for transformed training data using my recipe first? I want to make sure that vip is applying my recipe when computing the importance scores. I know the error says "raw training data" but I am unsure if pdp knows about my workflow.
vip(best_model, method = "firm", train = training(iris_split))
You'll want to take the same approach that I outlined in this answer.
Start by tuning and then training your model on the training data:
library(tidymodels)
#clean data
iris = iris %>%
mutate(class = case_when(Species == 'setosa' ~ 'setosa',
TRUE ~ 'other'),
class = factor(class)) %>%
select(-Species)
#split data into training and testing
iris_split = initial_split(iris, prop = 0.8)
iris_train = training(iris_split)
iris_test = testing(iris_split)
cv_splits = vfold_cv(iris_train, v = 5)
#preprocessing
iris_recipe = recipe(class ~., data = iris_train) %>%
step_center(Sepal.Length)
#specify ranger model
rf_spec = rand_forest(
mode = "classification",
mtry = tune(),
trees = 50
) %>%
set_engine("ranger", importance = "impurity")
## don't need any importance here if you will do it another way; probably remove
#tuning parameters
tuning_grid = grid_regular(mtry(range=c(1,4)), levels = 4)
iris_wkfl = workflow() %>%
add_recipe(iris_recipe) %>%
add_model(rf_spec)
iris_tune = tune_grid(iris_wkfl,
resamples = cv_splits,
grid = tuning_grid,
metrics = metric_set(accuracy))
#>
#> Attaching package: 'rlang'
#> The following objects are masked from 'package:purrr':
#>
#> %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
#> flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
#> splice
#>
#> Attaching package: 'vctrs'
#> The following object is masked from 'package:tibble':
#>
#> data_frame
#> The following object is masked from 'package:dplyr':
#>
#> data_frame
best_params = iris_tune %>%
select_best(metric = "accuracy")
rf_fit = finalize_workflow(iris_wkfl, best_params) %>%
fit(data = iris_train)
Your model is now trained and you can compute model-independent variable importance scores like FIRM. There are a couple of steps:
pull()
the fitted model out of the workflow.class
.predict()
but unfortunately for ranger it is predictions()
).library(vip)
#>
#> Attaching package: 'vip'
#> The following object is masked from 'package:utils':
#>
#> vi
rf_fit %>%
pull_workflow_fit() %>%
vip(method = "firm",
target = "class", metric = "accuracy",
pred_wrapper = ranger::predictions,
train = bake(prep(iris_recipe), new_data = NULL))
Created on 2020-12-10 by the reprex package (v0.3.0.9001)