Tidymodels and DALEX Error: The class of model must have a model_type method

The following code tunes an xgboost model on the mtcars data. I want to work with surrogate models using DALEX later on.

# Load required packages
library(tidymodels)
library(xgboost)
library(DALEX)
library(DALEXtra)

# Load example data (the built-in mtcars data frame; mpg is the outcome below)
data(mtcars)

# Create a recipe for preprocessing:
# mpg is the outcome and every other column is a predictor. All predictors
# are normalised, then nominal columns are one-hot encoded.
# The recipe is only declared here; it is estimated later, when the workflow
# is fitted and when prep() is called explicitly.
recipe_obj <- recipe(mpg ~ ., data = mtcars) %>%
  step_normalize(all_predictors()) %>%
  step_dummy(all_nominal(), one_hot = TRUE)

# Split data into training and testing sets
# (seeded so the split is reproducible; 80% of rows go to training,
# stratification requested on mpg)
set.seed(123)
data_split <- initial_split(mtcars, prop = 0.8, strata = "mpg")
train_data <- training(data_split)
test_data <- testing(data_split)

# Specify model and parameter ranges
# trees, mtry and min_n are left as tune() placeholders; their values are
# chosen by the grid search further down.
xgb_model <- boost_tree(
  trees = tune(),
  mtry = tune(),
  min_n = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

# Create a model workflow
# (bundles the preprocessing recipe with the model specification)
xgb_workflow <- workflow() %>%
  add_recipe(recipe_obj) %>%
  add_model(xgb_model)

# Define a parameter grid for tuning
# 20 candidate combinations from a Latin hypercube design over the three
# tuned parameters. mtry has no fixed upper bound, so it is finalized from
# the columns of train_data.
xgb_grid <- grid_latin_hypercube(
  trees(range = c(50, 500)),
  finalize(mtry(), train_data),
  min_n(range = c(2, 10)),
  size = 20
)

# Tune the model
# Every grid candidate is evaluated with 5-fold cross-validation on the
# training data and scored by RMSE; the seed fixes the fold assignment.
set.seed(123)
xgb_tuned <- xgb_workflow %>%
  tune_grid(
    resamples = vfold_cv(train_data, v = 5),
    grid = xgb_grid,
    metrics = metric_set(rmse)
  )

# Select the best model (the candidate with the best RMSE)
# NOTE(review): "rmse" is passed positionally here; the solution below passes
# it as `metric = "rmse"` -- confirm which form the installed tune version
# accepts.
best_xgb <- xgb_tuned %>%
  select_best("rmse")

# Finalize the workflow with the best model
# (replaces the tune() placeholders with the selected parameter values)
final_workflow <- xgb_workflow %>%
  finalize_workflow(best_xgb)

# Fit the final workflow to the training data
final_fit <- final_workflow %>% fit(data = train_data)

# Extract the fitted model from the workflow
# (the underlying engine object, without the recipe or the workflow wrapper)
fitted_model <- extract_fit_engine(final_fit)

Now I would like to use DALEX

# Preprocess the training data using the recipe
# prep() estimates the recipe on train_data; bake() applies it.
# Note: baked_train_data is not used anywhere below.
prepped_recipe <- prep(recipe_obj, training = train_data)
baked_train_data <- bake(prepped_recipe, new_data = train_data)

# Define the model type for the xgboost model
# (an S3 method for objects of class xgb.Booster that always returns
# "regression")
model_type.xgb.Booster <- function(x, ...) "regression"

# Build a DALEX explainer
# The explainer is given the extracted engine fit together with the raw
# (unbaked) training data, including the mpg column.
explainer <- DALEX::explain(
  model = fitted_model,
  data = train_data,
  y = train_data$mpg,
  label = "XGBoost"
)

# Check what is in an explainer
print(class(explainer))

# Choose a specific instance for local explanation
# (first row of the test set; drop = FALSE keeps it a one-row data frame)
instance_index <- 1
instance <- test_data[instance_index, , drop = FALSE]

# Ensure the instance is preprocessed using the recipe
baked_instance <- bake(prepped_recipe, new_data = instance)

# Create a local surrogate explanation using LIME (type = "lime"),
# keeping 3 features and using 1000 permutations.
# This is the call that raises the model_type error quoted below.
surrogate_model <- predict_surrogate(
  explainer = explainer,
  new_observation = baked_instance,
  n_features = 3, 
  n_permutations = 1000,
  type = "lime"
)

# Plot the surrogate model
plot(surrogate_model)

I get an error:

Error: The class of model must have a model_type method. See ?model_type to get an overview of models supported out of the box
In addition: Warning messages:
1: vs does not contain enough variance to use quantile binning. Using standard binning instead. 
2: am does not contain enough variance to use quantile binning. Using standard binning instead

How can I make sure a model_type method is specified so that my script works? I have read ?model_type, but I am not sure where and how I should specify it.

Solution

When creating your model explainer with DALEX::explain(), pass the fitted workflow object (final_fit) instead of the extracted engine model.

The first part of the script is unchanged:

# Load required packages
library(tidymodels)
library(xgboost)
library(DALEX)
library(DALEXtra)

# Make the built-in mtcars data frame available
data(mtcars)

# Preprocessing recipe, built step by step:
# mpg is the outcome, all remaining columns are predictors; predictors are
# normalised and nominal columns are one-hot encoded.
base_recipe <- recipe(mpg ~ ., data = mtcars)
normalized_recipe <- step_normalize(base_recipe, all_predictors())
recipe_obj <- step_dummy(normalized_recipe, all_nominal(), one_hot = TRUE)

# Reproducible 80/20 train/test split, stratification requested on mpg
set.seed(123)
data_split <- initial_split(mtcars, prop = 0.8, strata = "mpg")
#> Warning: The number of observations in each quantile is below the recommended threshold of 20.
#> • Stratification will use 1 breaks instead.
#> Warning: Too little data to stratify.
#> • Resampling will be unstratified.
train_data <- training(data_split)
test_data <- testing(data_split)

# Boosted-tree specification: trees, mtry and min_n are tune() placeholders
# that the grid search fills in later.
xgb_spec <- boost_tree(trees = tune(), mtry = tune(), min_n = tune())
xgb_spec <- set_engine(xgb_spec, "xgboost")
xgb_model <- set_mode(xgb_spec, "regression")

# Workflow = preprocessing recipe + model specification
xgb_workflow <- workflow()
xgb_workflow <- add_recipe(xgb_workflow, recipe_obj)
xgb_workflow <- add_model(xgb_workflow, xgb_model)

# Parameter objects for the three tuned arguments.
# NOTE(review): train_data still contains the outcome column mpg, so the
# upper bound derived for mtry counts it as well -- confirm this is intended.
trees_param <- trees(range = c(50, 500))
mtry_param <- finalize(mtry(), train_data)
min_n_param <- min_n(range = c(2, 10))

# 20 candidate combinations drawn from a Latin hypercube design
xgb_grid <- grid_latin_hypercube(
  trees_param,
  mtry_param,
  min_n_param,
  size = 20
)

# Grid search: each candidate is scored by RMSE under 5-fold cross-validation
# on the training data (seeded for reproducible folds)
set.seed(123)
xgb_tuned <- tune_grid(
  xgb_workflow,
  resamples = vfold_cv(train_data, v = 5),
  grid = xgb_grid,
  metrics = metric_set(rmse)
)

Name the `metric` argument in `select_best`

# Pick the parameter combination with the best RMSE; `metric` is passed by
# name
best_xgb <- select_best(xgb_tuned, metric = "rmse")

# Replace the tune() placeholders in the workflow with the selected values
final_workflow <- finalize_workflow(xgb_workflow, best_xgb)

Construct the explainer with the fitted workflow.

# Train the finalized workflow (recipe + tuned model) on the training set
final_fit <- fit(final_workflow, data = train_data)

# Estimate the recipe separately so single rows can be baked later on
prepped_recipe <- prep(recipe_obj, training = train_data)

# DALEX explainer built on the fitted workflow itself rather than on the
# extracted engine object
explainer <- DALEX::explain(
  model = final_fit,
  data = train_data,
  y = train_data[["mpg"]],
  label = "XGBoost"
)
#> Preparation of a new explainer is initiated
#>   -> model label       :  XGBoost 
#>   -> data              :  25  rows  11  cols 
#>   -> target variable   :  25  values 
#>   -> predict function  :  yhat.workflow  will be used (  default  )
#>   -> predicted values  :  No value for predict function target column. (  default  )
#>   -> model_info        :  package tidymodels , ver. 1.2.0 , task regression (  default  ) 
#>   -> predicted values  :  numerical, min =  10.40073 , mean =  20.76 , max =  33.89691  
#>   -> residual function :  difference between y and yhat (  default  )
#>   -> residuals         :  numerical, min =  -0.001708603 , mean =  -1.64032e-06 , max =  0.003088379  
#>   A new explainer has been created!

# Observation to explain: the first test-set row, kept as a one-row data frame
instance_index <- 1
instance <- test_data[instance_index, , drop = FALSE]

# The same observation after applying the estimated recipe
baked_instance <- prepped_recipe %>% bake(new_data = instance)

Add `DALEXtra` functions to environment and create the surrogate model

# Copy the DALEXtra methods for `dalex_explainer` objects into the global
# environment so that the model_type() / predict_model() dispatch performed
# during the LIME explanation can find them.
model_type.dalex_explainer <- DALEXtra::model_type.dalex_explainer
predict_model.dalex_explainer <- DALEXtra::predict_model.dalex_explainer

# The explainer wraps the fitted workflow, and a workflow applies its recipe
# at prediction time. The observation therefore has to be supplied on the raw
# scale: passing the baked row would normalise it a second time. The outcome
# column is dropped so that it cannot be selected as an explanatory feature.
raw_instance <- instance[, setdiff(colnames(instance), "mpg"), drop = FALSE]

# Create a local surrogate explanation using LIME
# (3 features, 1000 permutations)
surrogate_model <- predict_surrogate(
  explainer = explainer,
  new_observation = raw_instance,
  n_features = 3,
  n_permutations = 1000,
  type = "lime"
)
#> Warning: vs does not contain enough variance to use quantile binning. Using
#> standard binning instead.
#> Warning: am does not contain enough variance to use quantile binning. Using
#> standard binning instead.

# Plot the surrogate model
plot(surrogate_model)

Created on 2024-06-05 with reprex v2.1.0

Tidymodels and DALEX Error: The class of model must have a model_type method

Unchanged

Name the metric argument in select_best

Construct the explainer with the fitted workflow.

Add DALEXtra functions to environment and create the surrogate model

Name the `metric` argument in `select_best`

Add `DALEXtra` functions to environment and create the surrogate model