r tidyverse xgboost tidymodels dalex

Tidymodels and DALEX Error: The class of model must have a model_type method


The following code illustrates a tuned xgboost model fitted to the mtcars data. I want to work with surrogate models using DALEX later on.

# Reproducible example: tune an xgboost regression for mpg on mtcars with
# tidymodels, then fit the finalized workflow and extract the engine fit.

# Load required packages
library(tidymodels)
library(xgboost)
library(DALEX)
library(DALEXtra)

# Load example data
data(mtcars)

# Create a recipe for preprocessing
# NOTE(review): all predictors are normalized before step_dummy() runs; this
# assumes every predictor is numeric — confirm before reusing on other data.
recipe_obj <- recipe(mpg ~ ., data = mtcars) %>%
  step_normalize(all_predictors()) %>%
  step_dummy(all_nominal(), one_hot = TRUE)

# Split data into training and testing sets
# The reprex output in the solution below shows rsample warning that there is
# too little data to stratify, so the split ends up unstratified.
set.seed(123)
data_split <- initial_split(mtcars, prop = 0.8, strata = "mpg")
train_data <- training(data_split)
test_data <- testing(data_split)

# Specify model and parameter ranges
# trees, mtry and min_n are left as tune() placeholders to be filled in later.
xgb_model <- boost_tree(
  trees = tune(),
  mtry = tune(),
  min_n = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

# Create a model workflow
# Bundles the recipe and the model specification into one object.
xgb_workflow <- workflow() %>%
  add_recipe(recipe_obj) %>%
  add_model(xgb_model)

# Define a parameter grid for tuning
# NOTE(review): finalize(mtry(), train_data) derives the mtry range from
# train_data, which still contains the outcome column mpg — confirm that the
# resulting upper bound is the intended one.
xgb_grid <- grid_latin_hypercube(
  trees(range = c(50, 500)),
  finalize(mtry(), train_data),
  min_n(range = c(2, 10)),
  size = 20
)

# Tune the model
# 5-fold cross-validation on the training set, scored by RMSE only.
set.seed(123)
xgb_tuned <- xgb_workflow %>%
  tune_grid(
    resamples = vfold_cv(train_data, v = 5),
    grid = xgb_grid,
    metrics = metric_set(rmse)
  )

# Select the best model
# The metric name is passed positionally here; the solution below passes it
# by name as metric = "rmse".
best_xgb <- xgb_tuned %>%
  select_best("rmse")

# Finalize the workflow with the best model
final_workflow <- xgb_workflow %>%
  finalize_workflow(best_xgb)

# Fit the final workflow to the training data
final_fit <- final_workflow %>% fit(data = train_data)

# Extract the fitted model from the workflow
# This is the bare engine object, without the recipe; the code further down
# treats it as an xgb.Booster.
fitted_model <- extract_fit_engine(final_fit)

Now I would like to use DALEX

# Attempt to explain a single prediction with DALEX / DALEXtra (LIME).
# This is the part of the script that raises the error quoted below.

# Preprocess the training data using the recipe
# NOTE(review): baked_train_data is created here but not used by any of the
# code that follows.
prepped_recipe <- prep(recipe_obj, training = train_data)
baked_train_data <- bake(prepped_recipe, new_data = train_data)

# Define the model type for the xgboost model
# S3-style method for objects of class xgb.Booster, defined in the calling
# environment; it ignores its arguments and always returns "regression".
model_type.xgb.Booster <- function(x, ...) "regression"

# Build a DALEX explainer
# NOTE(review): the explainer wraps the bare engine fit while `data` is the
# un-baked training set — confirm that this combination is intended.
explainer <- DALEX::explain(
  model = fitted_model,
  data = train_data,
  y = train_data$mpg,
  label = "XGBoost"
)

# Check what is in an explainer
print(class(explainer))

# Choose a specific instance for local explanation
instance_index <- 1
# drop = FALSE keeps the selection as a one-row data frame
instance <- test_data[instance_index, , drop = FALSE]

# Ensure the instance is preprocessed using the recipe
baked_instance <- bake(prepped_recipe, new_data = instance)

# Build a local surrogate explanation with LIME (type = "lime") for the
# baked instance, using 3 features and 1000 permutations
surrogate_model <- predict_surrogate(
  explainer = explainer,
  new_observation = baked_instance,
  n_features = 3, 
  n_permutations = 1000,
  type = "lime"
)

# Plot the surrogate model
plot(surrogate_model)

I get an error:

Error: The class of model must have a model_type method. See ?model_type to get an overview of models supported out of the box
In addition: Warning messages:
1: vs does not contain enough variance to use quantile binning. Using standard binning instead. 
2: am does not contain enough variance to use quantile binning. Using standard binning instead

How to make sure a model_type method is specified and my script works? I have read the ?model_type and I am not sure where and how I should specify it.


Solution

  • When creating your model explainer with DALEX::explain(), pass the fitted workflow object final_fit as the model, rather than the engine fit extracted from it.

    Unchanged

    # Setup and tuning: same steps as in the question, reproduced so that
    # the reprex runs from a clean session.

    # Load required packages
    library(tidymodels)
    library(xgboost)
    library(DALEX)
    library(DALEXtra)
    
    # Load example data
    data(mtcars)
    
    # Create a recipe for preprocessing
    # NOTE(review): all predictors are normalized before step_dummy() runs;
    # this assumes every predictor is numeric — confirm before reusing.
    recipe_obj <- recipe(mpg ~ ., data = mtcars) %>%
      step_normalize(all_predictors()) %>%
      step_dummy(all_nominal(), one_hot = TRUE)
    
    # Split data into training and testing sets
    # As the warnings below show, the data set is too small to stratify, so
    # the split is effectively unstratified.
    set.seed(123)
    data_split <- initial_split(mtcars, prop = 0.8, strata = "mpg")
    #> Warning: The number of observations in each quantile is below the recommended threshold of 20.
    #> • Stratification will use 1 breaks instead.
    #> Warning: Too little data to stratify.
    #> • Resampling will be unstratified.
    train_data <- training(data_split)
    test_data <- testing(data_split)
    
    # Specify model and parameter ranges
    # trees, mtry and min_n are left as tune() placeholders.
    xgb_model <- boost_tree(
      trees = tune(),
      mtry = tune(),
      min_n = tune()
    ) %>%
      set_engine("xgboost") %>%
      set_mode("regression")
    
    # Create a model workflow
    # Bundles the recipe and the model specification into one object.
    xgb_workflow <- workflow() %>%
      add_recipe(recipe_obj) %>%
      add_model(xgb_model)
    
    # Define a parameter grid for tuning
    # NOTE(review): the mtry range is finalized on train_data, which still
    # contains the outcome column mpg — confirm the intended upper bound.
    xgb_grid <- grid_latin_hypercube(
      trees(range = c(50, 500)),
      finalize(mtry(), train_data),
      min_n(range = c(2, 10)),
      size = 20
    )
    
    # Tune the model
    # 5-fold cross-validation on the training set, scored by RMSE only.
    set.seed(123)
    xgb_tuned <- xgb_workflow %>%
      tune_grid(
        resamples = vfold_cv(train_data, v = 5),
        grid = xgb_grid,
        metrics = metric_set(rmse)
      )
    

    Name the metric argument in select_best

    # Pick the best tuning result according to RMSE; the metric is passed
    # by name
    best_xgb <- select_best(xgb_tuned, metric = "rmse")
    
    # Plug the selected hyperparameters into the tunable workflow
    final_workflow <- finalize_workflow(xgb_workflow, best_xgb)
    

    Construct the explainer with the fitted workflow.

    # Train the finalized workflow (recipe + model) on the training set
    final_fit <- fit(final_workflow, data = train_data)
    
    # Prepared recipe, used further down to bake a single observation
    prepped_recipe <- prep(recipe_obj, training = train_data)
    
    # Wrap the fitted workflow itself (not the extracted engine fit) in a
    # DALEX explainer
    explainer <- DALEX::explain(
      final_fit,
      data = train_data,
      y = train_data$mpg,
      label = "XGBoost"
    )
    #> Preparation of a new explainer is initiated
    #>   -> model label       :  XGBoost 
    #>   -> data              :  25  rows  11  cols 
    #>   -> target variable   :  25  values 
    #>   -> predict function  :  yhat.workflow  will be used (  default  )
    #>   -> predicted values  :  No value for predict function target column. (  default  )
    #>   -> model_info        :  package tidymodels , ver. 1.2.0 , task regression (  default  ) 
    #>   -> predicted values  :  numerical, min =  10.40073 , mean =  20.76 , max =  33.89691  
    #>   -> residual function :  difference between y and yhat (  default  )
    #>   -> residuals         :  numerical, min =  -0.001708603 , mean =  -1.64032e-06 , max =  0.003088379  
    #>   A new explainer has been created!
    
    # Choose a specific instance for local explanation
    instance_index <- 1
    # drop = FALSE keeps the selection as a one-row data frame
    instance <- test_data[instance_index, , drop = FALSE]
    
    # Ensure the instance is preprocessed using the recipe
    # NOTE(review): the explainer was built on the fitted workflow, which
    # presumably applies the recipe itself when predicting — confirm whether
    # a pre-baked row is really what should be explained.
    baked_instance <- bake(prepped_recipe, new_data = instance)
    

    Add DALEXtra functions to environment and create the surrogate model

    # Copy DALEXtra's methods for the explainer class into the calling
    # environment so that a model_type / predict_model method is found for
    # the explainer (this is what the "must have a model_type method" error
    # complains about).
    model_type.dalex_explainer <- DALEXtra::model_type.dalex_explainer
    predict_model.dalex_explainer <- DALEXtra::predict_model.dalex_explainer
    
    # Build a local LIME surrogate explanation for the chosen test row.
    # The explainer wraps the fitted workflow, whose recipe is applied at
    # predict time, and its `data` is the raw training set. The observation
    # must therefore be the raw `instance` (same columns and scale as the
    # explainer's data); passing the already-baked row would normalize it a
    # second time and explain a different point.
    surrogate_model <- predict_surrogate(
      explainer = explainer,
      new_observation = instance,
      n_features = 3,
      n_permutations = 1000,
      type = "lime"
    )
    #> Warning: vs does not contain enough variance to use quantile binning. Using
    #> standard binning instead.
    #> Warning: am does not contain enough variance to use quantile binning. Using
    #> standard binning instead.
    
    # Plot the surrogate model
    plot(surrogate_model)
    

    Created on 2024-06-05 with reprex v2.1.0