Problem when trying to produce SHAP values for a classification problem using tidymodels.
When I try to calculate SHAP values after training my model in tidymodels, following the steps on https://github.com/ModelOriented/kernelshap, I can't reproduce them for a classification problem (my target variable needs to be a factor). Running kernelshap() always returns:

#Error in check_pred(pred_fun(object, X, ...), n = n) : Predictions must be numeric

I found a workaround by using matrices instead of data frames and extracting the underlying model with extract_fit_parsnip(). But the question remains: is there a way to reproduce the example for a classification problem? Example code below.
library(tidyverse)
library(tidymodels)

Default <- ISLR::Default

# recode the target as a factor, otherwise the model will not work
Default <- Default %>%
  mutate(
    default = factor(
      case_when(
        default == "Yes" ~ 1,
        default == "No" ~ 0
      ),
      levels = c(1, 0)
    )
  )
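As a side note, the level order matters later on: predict(..., type = "prob") names its probability columns after the factor levels, so here they will be .pred_1 and .pred_0, and tidymodels treats the first level as the event class by default.

# quick check of the level order ("1" comes first)
levels(Default$default)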
# model fitting
split <- initial_split(Default)
tr <- training(split)
te <- testing(split)

rec <- recipe(default ~ ., data = Default) %>%
  step_dummy(all_nominal_predictors())

spec <- boost_tree() %>%
  set_mode("classification") %>%
  set_engine("xgboost")

wf <- workflow() %>%
  add_model(spec) %>%
  add_recipe(rec)

# model fit
mod <- fit(wf, tr)
library(kernelshap)

x <- rec %>%
  prep() %>%
  bake(te %>% slice_sample(n = 50)) %>%
  select(-default) %>%
  as.data.frame()

bg <- rec %>%
  prep() %>%
  bake(te %>% slice_sample(n = 10)) %>%
  mutate(default = as.numeric(as.character(default))) %>%
  as.data.frame()
# test for prediction
predict(mod, te)
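For a classification workflow, the default predict() output is the hard class as a factor, which is presumably what trips kernelshap's "Predictions must be numeric" check; type = "prob" returns numeric probabilities instead:

predict(mod, head(te))                 # tibble with a factor column .pred_class
predict(mod, head(te), type = "prob")  # numeric columns .pred_1 and .pred_0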
# extract the fitted model from the tidymodels workflow
md <- extract_fit_parsnip(mod)

# this version works
kernelshap(
  md$fit,
  X = x %>% as.matrix(),   # if I pass a matrix instead of a data frame, it works
  bg_X = bg %>% as.matrix()
)
# this version does not work
kernelshap(
  mod,
  X = te %>%
    select(-default) %>% # remove the target variable
    slice_sample(n = 50) %>%
    as.data.frame(),
  bg_X = te %>%
    slice_sample(n = 50) %>%
    as.data.frame()
)
################################### error message:
#Error in check_pred(pred_fun(object, X, ...), n = n) :
# Predictions must be numeric
######################################
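My understanding of why the matrix version works (a guess from the outputs, not verified in the kernelshap source): predict() on the extracted xgboost booster returns numeric probabilities directly, while predict() on the workflow returns the factor class by default, which check_pred() rejects.

# the extracted booster predicts numeric probabilities
predict(md$fit, newdata = x %>% as.matrix())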
# toy example from the GitHub page, using tidymodels
library(tidymodels)
library(kernelshap)

iris_recipe <- iris %>%
  recipe(Sepal.Length ~ .)

reg <- linear_reg() %>%
  set_engine("lm")

iris_wf <- workflow() %>%
  add_recipe(iris_recipe) %>%
  add_model(reg)

fit <- iris_wf %>%
  fit(iris)

ks <- kernelshap(fit, iris[, -1], bg_X = iris)
ks
{kernelshap} is intended to work quite well with tidymodels. In your case, you can simply write:
library(kernelshap)
library(shapviz)

x <- c("student", "balance", "income")

ks <- kernelshap(
  mod,
  X = head(Default, 1000),   # assuming random row order
  bg_X = head(Default, 200), # assuming random row order
  type = "prob",             # predictions must be numeric
  feature_names = x          # or use X = head(Default[x], 1000)
)

sv <- shapviz(ks)  # contains one shapviz object per class

sv_dependence(sv$.pred_1, v = x)
sv_importance(sv$.pred_1, kind = "bee", show_numbers = TRUE)
sv_importance(sv$.pred_1)
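If you prefer the raw numbers over the plots, the kernelshap object stores them as well; for a probabilistic classifier the SHAP values come as a list of matrices, one per prediction column. (Field names as I remember them from the current release; inspect the object with str(ks) if in doubt.)

str(ks$S)    # list of SHAP matrices, one per probability column
ks$baseline  # average prediction(s) on the background data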
library(doFuture)

# suppress a warning about random seeds
options(doFuture.rng.onMisuse = "ignore")

# set up a parallel backend
registerDoFuture()
plan(multisession, workers = 4)  # Windows
# plan(multicore, workers = 4)   # Linux, macOS, Solaris

x <- c("student", "balance", "income")

ks <- kernelshap(
  mod,
  X = head(Default, 1000),
  bg_X = head(Default, 200),
  type = "prob",
  feature_names = x,
  parallel = TRUE,
  parallel_args = list(.packages = "tidymodels")
)
The last argument is necessary because stats::predict() hides which package actually provides the prediction method, so the parallel workers need tidymodels loaded explicitly.
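When the parallel run is finished, you can switch the future plan back to sequential; this is plain {future} functionality, nothing kernelshap-specific.

plan(sequential)  # shut down the parallel workers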