The following code illustrates a tuned XGBoost model on the mtcars data. I want to work with surrogate models using DALEX later on.
# Load required packages
library(tidymodels)
library(xgboost)
library(DALEX)
library(DALEXtra)
# Load example data
data(mtcars)
# Create a recipe for preprocessing
recipe_obj <- recipe(mpg ~ ., data = mtcars) %>%
  step_normalize(all_predictors()) %>%
  step_dummy(all_nominal(), one_hot = TRUE)
# Split data into training and testing sets
set.seed(123)
data_split <- initial_split(mtcars, prop = 0.8, strata = "mpg")
train_data <- training(data_split)
test_data <- testing(data_split)
# Specify model and parameter ranges
xgb_model <- boost_tree(
  trees = tune(),
  mtry = tune(),
  min_n = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("regression")
# Create a model workflow
xgb_workflow <- workflow() %>%
  add_recipe(recipe_obj) %>%
  add_model(xgb_model)
# Define a parameter grid for tuning
xgb_grid <- grid_latin_hypercube(
  trees(range = c(50, 500)),
  finalize(mtry(), train_data),
  min_n(range = c(2, 10)),
  size = 20
)
# Tune the model
set.seed(123)
xgb_tuned <- xgb_workflow %>%
  tune_grid(
    resamples = vfold_cv(train_data, v = 5),
    grid = xgb_grid,
    metrics = metric_set(rmse)
  )
# Select the best model
best_xgb <- xgb_tuned %>%
  select_best("rmse")
# Finalize the workflow with the best model
final_workflow <- xgb_workflow %>%
  finalize_workflow(best_xgb)
# Fit the final workflow to the training data
final_fit <- final_workflow %>% fit(data = train_data)
# Extract the fitted model from the workflow
fitted_model <- extract_fit_engine(final_fit)
Now I would like to use DALEX:
# Preprocess the training data using the recipe
prepped_recipe <- prep(recipe_obj, training = train_data)
baked_train_data <- bake(prepped_recipe, new_data = train_data)
# Define the model type for the xgboost model
model_type.xgb.Booster <- function(x, ...) "regression"
# Build a DALEX explainer
explainer <- DALEX::explain(
  model = fitted_model,
  data = train_data,
  y = train_data$mpg,
  label = "XGBoost"
)
# Check what is in an explainer
print(class(explainer))
# Choose a specific instance for local explanation
instance_index <- 1
instance <- test_data[instance_index, , drop = FALSE]
# Ensure the instance is preprocessed using the recipe
baked_instance <- bake(prepped_recipe, new_data = instance)
# Create a surrogate decision tree model using LIME
surrogate_model <- predict_surrogate(
  explainer = explainer,
  new_observation = baked_instance,
  n_features = 3,
  n_permutations = 1000,
  type = "lime"
)
# Plot the surrogate model
plot(surrogate_model)
I get an error:
Error: The class of model must have a model_type method. See ?model_type to get an overview of models supported out of the box
In addition: Warning messages:
1: vs does not contain enough variance to use quantile binning. Using standard binning instead.
2: am does not contain enough variance to use quantile binning. Using standard binning instead
How can I make sure a model_type method is specified so that my script works? I have read ?model_type, but I am not sure where or how I should specify it.
When creating your model explainer with DALEX::explain(), pass the final_fit workflow object rather than the xgb.Booster extracted from it.
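DALEX dispatches its default predict and model-info helpers on the class of the model object: a fitted workflow is supported out of the box (note the "yhat.workflow will be used" line in the output below), while a bare xgb.Booster returned by extract_fit_engine() has no model_type method. A quick check, using the objects from your script:
class(fitted_model) # "xgb.Booster" -> no model_type method, explain() errors
class(final_fit)    # "workflow"    -> handled by DALEX defaults (yhat.workflow)
Here is the full, corrected reprex: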
# Load required packages
library(tidymodels)
library(xgboost)
library(DALEX)
library(DALEXtra)
# Load example data
data(mtcars)
# Create a recipe for preprocessing
recipe_obj <- recipe(mpg ~ ., data = mtcars) %>%
  step_normalize(all_predictors()) %>%
  step_dummy(all_nominal(), one_hot = TRUE)
# Split data into training and testing sets
set.seed(123)
data_split <- initial_split(mtcars, prop = 0.8, strata = "mpg")
#> Warning: The number of observations in each quantile is below the recommended threshold of 20.
#> • Stratification will use 1 breaks instead.
#> Warning: Too little data to stratify.
#> • Resampling will be unstratified.
train_data <- training(data_split)
test_data <- testing(data_split)
# Specify model and parameter ranges
xgb_model <- boost_tree(
  trees = tune(),
  mtry = tune(),
  min_n = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("regression")
# Create a model workflow
xgb_workflow <- workflow() %>%
  add_recipe(recipe_obj) %>%
  add_model(xgb_model)
# Define a parameter grid for tuning
xgb_grid <- grid_latin_hypercube(
  trees(range = c(50, 500)),
  finalize(mtry(), train_data),
  min_n(range = c(2, 10)),
  size = 20
)
# Tune the model
set.seed(123)
xgb_tuned <- xgb_workflow %>%
  tune_grid(
    resamples = vfold_cv(train_data, v = 5),
    grid = xgb_grid,
    metrics = metric_set(rmse)
  )
Also, pass the metric by name via the metric argument in select_best():
# Select the best model
best_xgb <- xgb_tuned %>%
  select_best(metric = "rmse")
# Finalize the workflow with the best model
final_workflow <- xgb_workflow %>%
  finalize_workflow(best_xgb)
# Fit the final workflow to the training data
final_fit <- final_workflow %>% fit(data = train_data)
prepped_recipe <- prep(recipe_obj, training = train_data)
# Build a DALEX explainer
explainer <- DALEX::explain(
  model = final_fit,
  data = train_data,
  y = train_data$mpg,
  label = "XGBoost"
)
#> Preparation of a new explainer is initiated
#> -> model label : XGBoost
#> -> data : 25 rows 11 cols
#> -> target variable : 25 values
#> -> predict function : yhat.workflow will be used ( default )
#> -> predicted values : No value for predict function target column. ( default )
#> -> model_info : package tidymodels , ver. 1.2.0 , task regression ( default )
#> -> predicted values : numerical, min = 10.40073 , mean = 20.76 , max = 33.89691
#> -> residual function : difference between y and yhat ( default )
#> -> residuals : numerical, min = -0.001708603 , mean = -1.64032e-06 , max = 0.003088379
#> A new explainer has been created!
# Choose a specific instance for local explanation
instance_index <- 1
instance <- test_data[instance_index, , drop = FALSE]
# Ensure the instance is preprocessed using the recipe
baked_instance <- bake(prepped_recipe, new_data = instance)
Assign the DALEXtra helper methods to the global environment (lime needs to find them there) and create the surrogate model:
model_type.dalex_explainer <- DALEXtra::model_type.dalex_explainer
predict_model.dalex_explainer <- DALEXtra::predict_model.dalex_explainer
# Create a surrogate decision tree model using LIME
surrogate_model <- predict_surrogate(
  explainer = explainer,
  new_observation = baked_instance,
  n_features = 3,
  n_permutations = 1000,
  type = "lime"
)
#> Warning: vs does not contain enough variance to use quantile binning. Using
#> standard binning instead.
#> Warning: am does not contain enough variance to use quantile binning. Using
#> standard binning instead.
# Plot the surrogate model
plot(surrogate_model)
Created on 2024-06-05 with reprex v2.1.0
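As an aside, DALEXtra also provides explain_tidymodels(), a thin wrapper around DALEX::explain() for fitted tidymodels workflows. A minimal sketch using the same objects as above (the explainer name is my own, and I drop the target column from data, as DALEX recommends):
# Alternative explainer via DALEXtra's tidymodels wrapper
explainer_tm <- DALEXtra::explain_tidymodels(
  final_fit,
  data = dplyr::select(train_data, -mpg),  # predictors only
  y = train_data$mpg,
  label = "XGBoost (tidymodels)"
)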