Tags: r, tidymodels, shap

R kernelshap package with tidymodels for classification


Problem when trying to produce SHAP values for a classification problem using tidymodels.

When I try to calculate SHAP values after training my model in tidymodels, following the steps on https://github.com/ModelOriented/kernelshap, I can't reproduce them for a classification problem. My target variable needs to be a factor. Running kernelshap() always returns:

# Error in check_pred(pred_fun(object, X, ...), n = n) : Predictions must be numeric

I found a workaround by using matrices instead of data frames and extracting the model with extract_fit_parsnip(). But the question remains: is there a way to reproduce the example for a classification problem? Example code below.



library(tidyverse)
library(tidymodels)


Default <- ISLR::Default


Default <- Default %>%
  mutate(
    default = factor(
      case_when(
        default == "Yes" ~ 1,
        default == "No" ~ 0
      ),
      levels = c(1, 0)
    )
  ) # changing to factor, otherwise the model will not work
  

# model fitting
split <- initial_split(Default)
tr <- training(split)
te <- testing(split)
  

  
  
rec <- recipe(default ~ ., data = Default) %>%
  step_dummy(all_nominal_predictors())




spec <- boost_tree() %>% 
  set_mode("classification") %>% 
  set_engine("xgboost")


wf <- workflow() %>% 
  add_model(spec) %>% 
  add_recipe(rec)

# model fit
mod <- fit(wf,tr)



library(kernelshap)

x <- rec %>%
  prep() %>%
  bake(te %>% slice_sample(n = 50)) %>%
  select(-default) %>%
  as.data.frame()

bg <- rec %>%
  prep() %>%
  bake(te %>% slice_sample(n = 10)) %>%
  mutate(default = as.numeric(as.character(default))) %>%
  as.data.frame()

# test for prediction
predict(mod, te)

# extract model from tidymodels
md <- extract_fit_parsnip(mod)

# this version works
kernelshap(md$fit,
           X = x %>% as.matrix(),    # works when using matrices
           bg_X = bg %>% as.matrix() # raw booster's predict() returns numeric probabilities
)

# this version does not work
kernelshap(mod,
           X = te %>%
             select(-default) %>%    # remove target var
             slice_sample(n = 50) %>%
             as.data.frame(),
           bg_X = te %>%
             slice_sample(n = 50) %>%
             as.data.frame()
)

################################### error message:

#Error in check_pred(pred_fun(object, X, ...), n = n) : 
#  Predictions must be numeric
######################################
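
A quick check of what the fitted workflow returns (mod and te from the code above) points to the cause: the default predict() of a classification workflow gives the predicted class as a factor, while kernelshap() expects numeric predictions.

predict(mod, head(te))                  # .pred_class is a factor
predict(mod, head(te), type = "prob")   # numeric class probabilities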

# toy regression example from the GitHub page, using tidymodels


library(tidymodels)
library(kernelshap)

iris_recipe <- iris %>%
  recipe(Sepal.Length ~ .)

reg <- linear_reg() %>%
  set_engine("lm")

iris_wf <- workflow() %>%
  add_recipe(iris_recipe) %>%
  add_model(reg)

fit <- iris_wf %>%
  fit(iris)

ks <- kernelshap(fit, iris[, -1], bg_X = iris)
ks

Solution

  • {kernelshap} is intended to work smoothly with tidymodels. In your case, you can simply write:

    library(kernelshap)
    library(shapviz)
    
    x <- c("student", "balance", "income")
    ks <- kernelshap(
      mod, 
      X = head(Default, 1000),    # Assuming random row order
      bg_X = head(Default, 200),  # Assuming random row order
      type = "prob",              # Predictions must be numeric
      feature_names = x           # Or use X = head(Default[x], 1000)
    )
    
    sv <- shapviz(ks)             # Contains one shapviz object per class
    sv_dependence(sv$.pred_1, v = x)
    sv_importance(sv$.pred_1, kind = "bee", show_numbers = TRUE)
    sv_importance(sv$.pred_1)
    
    

    [Resulting plots: SHAP dependence plots, beeswarm importance plot, and importance bar chart]
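
    Side note (not part of the original answer, a minimal sketch): type = "prob" is
    forwarded to the prediction function, so an equivalent call supplies an explicit
    pred_fun that returns numeric class probabilities (same mod, Default, and x as above):

    ks2 <- kernelshap(
      mod,
      X = head(Default, 1000),
      bg_X = head(Default, 200),
      pred_fun = function(m, X) predict(m, X, type = "prob"),  # numeric output
      feature_names = x
    )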

    Comments

    Parallel use

    library(doFuture)
    
    options(doFuture.rng.onMisuse = "ignore")  # To suppress some warning on random seeds
    
    # Set up parallel backend
    registerDoFuture()
    plan(multisession, workers = 4)  # Windows
    # plan(multicore, workers = 4)   # Linux, macOS, Solaris
    
    x <- c("student", "balance", "income")
    ks <- kernelshap(
      mod, 
      X = head(Default, 1000),   
      bg_X = head(Default, 200),
      type = "prob",
      feature_names = x,
      parallel = TRUE,
      parallel_args = list(.packages = "tidymodels")
    )
    

    The last argument is necessary because the generic stats::predict() does not reveal which package provides the underlying prediction method, so {tidymodels} has to be loaded explicitly on the parallel workers.