I'm trying to fit an XGBoost
model using tidymodels
and I keep receiving the following error:
→ A | error: `terms` must be a <terms>, not a double vector.
There were issues with some computations A: x5
Warning message:
All models failed. Run `show_notes(.Last.tune.result)` for more information.
I downloaded a dataset from Kaggle. The following chunk loads the data, recodes the dependent variable into a binary factor, and transforms predictors into numeric and factors only.
# Load the Kaggle loan-payments data, derive the binary outcome, and keep
# only the modelling columns (outcome + 5 predictors).
# NOTE(review): the kept data retains a column literally named `terms`;
# presumably this name later collides with model-building internals — see
# the tuning error below. Consider renaming it (e.g. `loan_term`).
#load data
dat <- read_csv(file = "loanPayments.csv") %>%
mutate(
#compute DV: 1 = paid on time (past_due_days is NA), 0 = past due
paid = as.numeric(is.na(past_due_days)),
#clean education: strip "Above"/"Below"/" or " qualifiers from the labels
education = gsub(pattern = "Above|Below| or ", replacement = "", x = education),
# collapse any remaining spaces so levels are single tokens
education = gsub(pattern = " ", replacement = "", x = education),
# fix the "Bechalor" typo present in the raw file
education = gsub(pattern = "Bechalor", replacement = "bachelor", x = education),
education = tolower(education),
#convert characters to factors
education = as.factor(education),
Gender = as.factor(Gender),
# dv to factor (tidymodels classification requires a factor outcome)
paid = as.factor(paid)
) %>%
dplyr::select(paid, Principal, terms, age, education, Gender)
#rename columns as lower case
colnames(dat) <- tolower(colnames(dat))
Then, I set up the recipe and k-fold cross-validation after performing the initial_split.
# Train/test split, preprocessing recipe, and CV folds.
# NOTE(review): naming the result `initial_split` shadows the
# rsample::initial_split() function — it works, but is confusing.
#train test split
initial_split <- initial_split(
data = dat, #dataset
prop = 0.8, #train/test proportion
strata = paid) #variable to stratify (DV)
#preprocessing
preprocessing_recipe <- recipe(
paid ~ ., #state formula
data = training(initial_split)) %>%
# create dummy variable
# NOTE(review): all_nominal() includes the factor OUTCOME `paid`, so the
# outcome gets dummy-coded too; all_nominal_predictors() is presumably
# what was intended — TODO confirm.
step_dummy(all_nominal()) %>%
# centering and scaling continuous variables
step_normalize(all_numeric_predictors()) %>%
# remove no variance predictors
step_nzv(all_numeric_predictors()) %>%
# estimate the preprocessing parameters on the training data
prep()
#cross validation
# NOTE(review): baking here means the CV folds contain already-preprocessed
# data; the workflow formula downstream is then applied to baked columns
# (including the renamed/dummied ones). Usually the unprepped recipe is
# given to the workflow and resamples are built from the raw training data.
cv_folds <- bake(
preprocessing_recipe, #preprocessing
new_data = training(initial_split) #data
) %>%
vfold_cv(v = 5) #folds
Here, I prepare the model specification and the tidymodels
workflow
to tune some of the hyperparameters.
# Model specification, tuning grid, workflow, and grid search.
#define model
xgbSpec <- parsnip::boost_tree(
trees = 50, min_n = tune(),
tree_depth = tune(), learn_rate = tune(),
loss_reduction = tune()) %>%
# NOTE(review): objective "multi:softprob" with num_class = 2 is the
# multiclass setup; for a 2-level factor outcome the engine's default
# binary objective is presumably what's wanted — TODO confirm.
set_engine(engine = "xgboost",
objective = "multi:softprob",
num_class = 2) %>%
set_mode(mode = "classification")
#set parameters grid: space-filling sample of the 4 tuned hyperparameters
xgboostGrid <- grid_latin_hypercube(
min_n(), tree_depth(),
learn_rate(), loss_reduction(),
size = 10) #how many hyperparam combinations?
#create workflow
xgboost_wf <- workflows::workflow() %>%
# NOTE(review): add_formula(paid ~ .) here runs on the baked data, which
# contains a predictor column named `terms`; presumably that name collides
# with the `terms` object the model-frame machinery builds, producing
# "`terms` must be a <terms>, not a double vector" — verify by renaming
# the column or by using add_recipe() instead (see answer below).
add_formula(paid ~ .) %>%
add_model(xgbSpec) #model
#tune parameters
xgboostFitted <- tune::tune_grid(
object = xgboost_wf, #workflow
resamples = cv_folds, #cross-validation
grid = xgboostGrid, #grid search
metrics = yardstick::metric_set(accuracy),
control = control_grid(save_pred = TRUE))
After running the code, the tune produces the error below:
→ A | error: `terms` must be a <terms>, not a double vector.
There were issues with some computations A: x5
Warning message:
All models failed. Run `show_notes(.Last.tune.result)` for more information.
Here is the output of dput()
on my data.
structure(list(paid = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), levels = c("0",
"1"), class = "factor"), principal = c(1000, 1000, 1000, 1000,
1000, 300, 1000, 1000, 1000, 800, 300, 1000, 1000, 900, 1000,
800, 1000, 1000, 1000, 800, 1000, 1000, 1000, 1000, 1000, 1000,
800, 1000, 1000, 1000, 800, 1000, 1000, 1000, 800, 800, 1000,
700, 1000, 1000, 1000, 800, 1000, 1000, 1000, 800, 1000, 1000,
1000, 800, 800, 1000, 800, 1000, 1000, 1000, 1000, 1000, 800,
800, 1000, 1000, 1000, 1000, 800, 900, 1000, 1000, 300, 1000,
800, 800, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,
1000, 1000, 800, 1000, 1000, 1000, 1000, 800, 800, 800, 1000,
800, 800, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 800, 1000,
1000, 1000, 1000, 1000, 1000, 1000, 1000, 800, 1000, 1000, 1000,
1000, 1000, 1000, 1000, 1000, 800, 1000, 1000, 1000, 1000, 1000,
1000, 1000, 800, 1000, 1000, 1000, 800, 1000, 1000, 800, 1000,
800, 1000, 1000, 1000, 1000, 800, 1000, 800, 1000, 1000, 1000,
300, 1000, 800, 1000, 800, 500, 1000, 1000, 1000, 1000, 800,
1000, 1000, 1000, 800, 1000, 1000, 1000, 1000, 1000, 800, 1000,
1000, 1000, 1000, 800, 1000, 800, 800, 1000, 1000, 1000, 1000,
1000, 300, 1000, 800, 1000, 1000, 1000, 1000, 800, 800, 1000,
1000, 1000, 1000, 800, 1000, 1000, 1000, 1000, 1000, 1000, 1000,
1000, 800, 1000, 1000, 1000, 1000, 300, 1000, 1000, 1000, 1000,
1000, 1000, 1000, 1000, 800, 1000, 1000, 1000, 1000, 1000, 1000,
1000, 1000, 1000, 800, 800, 800, 1000, 1000, 1000, 1000, 800,
1000, 1000, 800, 1000, 1000, 1000, 1000, 800, 800, 1000, 1000,
800, 800, 1000, 1000, 800, 1000, 1000, 500, 1000, 1000, 800,
1000, 1000, 1000, 800, 1000, 800, 1000, 800, 1000, 800, 1000,
1000, 800, 1000, 1000, 1000, 1000, 1000, 800, 800, 1000, 800,
800, 800, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 800,
1000, 1000, 1000, 1000, 1000, 1000, 800, 1000, 1000, 1000, 1000,
1000, 1000, 800, 800, 1000, 800, 1000, 1000, 800, 1000, 800,
1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,
800, 1000, 1000, 800, 1000, 1000, 1000, 1000, 800, 1000, 1000,
1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 800,
1000, 1000, 800, 1000, 800, 1000, 1000, 1000, 1000, 1000, 1000,
1000, 1000, 1000, 1000, 1000, 800, 1000, 1000, 800, 1000, 800,
800, 1000, 1000, 800, 1000, 800, 1000, 1000, 800, 1000, 1000,
1000, 1000, 1000, 800, 1000, 1000, 1000, 1000, 800, 1000, 1000,
1000, 1000, 800, 1000, 1000, 800, 1000, 1000, 800, 1000, 1000,
1000, 1000, 1000, 1000, 800, 1000, 800, 1000, 1000, 1000, 800,
1000, 800, 1000, 1000, 800, 1000, 800, 800, 1000, 1000, 1000,
800, 1000, 1000, 1000, 1000, 1000, 1000, 500, 800, 1000, 800,
1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000,
1000, 1000, 1000, 1000, 1000, 1000, 1000, 800, 1000, 1000, 1000,
1000, 1000, 1000, 1000, 800, 800, 1000, 1000, 1000, 1000, 1000,
1000, 1000, 1000, 1000, 1000, 1000, 800, 1000, 800, 1000, 800,
1000, 1000, 1000, 1000, 1000, 800, 1000, 1000, 1000, 800, 1000,
1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 800, 1000,
1000), terms = c(30, 30, 30, 15, 30, 7, 30, 30, 30, 15, 7, 15,
30, 7, 7, 15, 30, 15, 30, 30, 30, 30, 30, 15, 30, 30, 15, 15,
30, 30, 15, 30, 30, 30, 15, 15, 30, 15, 15, 30, 15, 15, 7, 15,
15, 7, 30, 30, 30, 15, 15, 30, 15, 30, 30, 15, 30, 30, 15, 15,
30, 15, 30, 30, 15, 15, 30, 30, 7, 30, 15, 15, 30, 15, 15, 30,
30, 30, 30, 30, 30, 7, 15, 15, 30, 30, 30, 30, 15, 15, 15, 7,
15, 15, 30, 30, 15, 30, 30, 30, 7, 15, 30, 15, 15, 30, 30, 30,
30, 7, 15, 30, 30, 30, 30, 15, 30, 30, 30, 15, 30, 30, 15, 15,
30, 30, 15, 15, 15, 30, 15, 15, 30, 30, 15, 30, 15, 15, 15, 30,
30, 15, 7, 15, 30, 15, 30, 7, 15, 15, 15, 15, 15, 15, 30, 30,
15, 15, 15, 30, 15, 15, 30, 15, 15, 30, 30, 30, 15, 30, 15, 30,
15, 30, 15, 15, 30, 30, 30, 30, 15, 7, 30, 15, 30, 30, 30, 15,
15, 15, 30, 30, 15, 30, 15, 30, 30, 15, 30, 30, 15, 30, 15, 15,
30, 7, 30, 30, 7, 30, 15, 30, 30, 30, 30, 30, 15, 15, 15, 30,
15, 30, 30, 15, 30, 30, 15, 15, 15, 15, 15, 7, 30, 30, 15, 30,
30, 15, 15, 15, 30, 15, 15, 15, 7, 30, 15, 15, 30, 15, 15, 30,
30, 7, 30, 30, 15, 15, 7, 30, 15, 15, 30, 15, 15, 15, 15, 15,
30, 30, 30, 30, 30, 30, 30, 15, 15, 30, 15, 30, 15, 15, 30, 30,
15, 30, 30, 30, 30, 15, 15, 30, 30, 30, 30, 30, 15, 30, 30, 30,
15, 30, 30, 15, 15, 15, 15, 30, 30, 15, 30, 15, 30, 30, 30, 15,
30, 15, 30, 15, 30, 30, 30, 15, 15, 30, 15, 30, 30, 30, 15, 15,
30, 30, 30, 30, 30, 30, 15, 15, 30, 30, 15, 30, 15, 30, 30, 15,
30, 15, 30, 15, 30, 30, 30, 15, 30, 30, 30, 30, 30, 15, 30, 15,
15, 30, 30, 15, 30, 30, 15, 30, 15, 15, 30, 15, 15, 30, 30, 30,
30, 15, 15, 30, 30, 30, 15, 30, 15, 15, 15, 15, 30, 30, 15, 30,
30, 15, 30, 30, 30, 15, 30, 15, 15, 30, 15, 30, 30, 15, 15, 30,
15, 15, 30, 15, 30, 15, 15, 30, 15, 30, 15, 15, 30, 30, 30, 30,
30, 15, 15, 15, 7, 30, 30, 30, 30, 30, 30, 30, 30, 15, 30, 30,
30, 15, 30, 30, 30, 30, 30, 15, 15, 30, 30, 15, 30, 30, 30, 15,
15, 30, 30, 15, 30, 30, 30, 15, 30, 30, 15, 30, 15, 30, 15, 30,
15, 30, 15, 30, 15, 15, 15, 30, 30, 30, 15, 30, 15, 30, 30, 30,
30, 15, 30, 30, 15, 15, 30, 30), age = c(45, 50, 33, 27, 28,
35, 29, 36, 28, 26, 29, 39, 26, 26, 27, 26, 40, 32, 32, 26, 26,
43, 25, 26, 26, 29, 39, 34, 31, 33, 33, 37, 27, 37, 33, 29, 27,
33, 24, 21, 32, 30, 31, 30, 24, 35, 22, 32, 32, 50, 27, 35, 35,
34, 21, 25, 27, 26, 44, 39, 34, 37, 34, 45, 24, 28, 28, 37, 35,
43, 29, 29, 33, 34, 25, 30, 31, 35, 37, 44, 28, 25, 29, 33, 37,
33, 24, 27, 43, 46, 34, 32, 38, 27, 33, 36, 26, 34, 22, 31, 29,
38, 30, 45, 35, 30, 31, 31, 28, 29, 29, 27, 27, 33, 28, 25, 40,
23, 35, 24, 34, 22, 20, 23, 33, 26, 28, 43, 34, 38, 26, 43, 26,
33, 24, 30, 32, 22, 47, 20, 28, 35, 27, 33, 30, 31, 26, 37, 26,
35, 29, 23, 23, 30, 34, 36, 26, 29, 28, 27, 24, 31, 28, 27, 25,
24, 28, 28, 35, 38, 38, 29, 35, 24, 39, 25, 38, 30, 21, 46, 31,
29, 35, 30, 27, 31, 33, 34, 28, 42, 32, 30, 25, 27, 21, 24, 29,
40, 29, 29, 30, 26, 36, 27, 20, 26, 26, 27, 23, 39, 27, 30, 33,
27, 35, 29, 50, 31, 31, 29, 35, 39, 29, 30, 33, 26, 25, 37, 26,
26, 27, 34, 37, 36, 33, 30, 30, 36, 29, 36, 32, 29, 36, 30, 31,
19, 26, 34, 35, 35, 38, 29, 28, 22, 32, 31, 28, 37, 25, 19, 51,
29, 23, 30, 23, 34, 31, 24, 42, 40, 29, 32, 28, 35, 30, 44, 37,
31, 36, 31, 42, 28, 30, 30, 24, 34, 29, 38, 34, 28, 30, 41, 29,
37, 36, 30, 27, 29, 40, 28, 29, 37, 33, 27, 24, 31, 28, 40, 33,
41, 30, 26, 27, 20, 24, 26, 30, 29, 22, 24, 25, 28, 37, 32, 34,
28, 35, 27, 24, 44, 31, 27, 21, 30, 38, 34, 31, 23, 27, 39, 30,
25, 50, 23, 38, 27, 31, 40, 32, 29, 26, 25, 35, 41, 37, 34, 45,
26, 32, 28, 34, 29, 26, 26, 22, 27, 33, 28, 24, 37, 36, 18, 25,
40, 29, 26, 30, 33, 30, 32, 25, 35, 30, 26, 29, 26, 46, 36, 38,
32, 30, 35, 29, 26, 32, 25, 33, 39, 28, 26, 26, 28, 39, 29, 33,
27, 34, 26, 28, 32, 27, 21, 39, 38, 36, 33, 21, 25, 29, 33, 47,
33, 23, 24, 27, 32, 33, 27, 35, 37, 28, 33, 34, 29, 34, 29, 24,
34, 25, 24, 30, 28, 24, 26, 24, 29, 31, 26, 25, 29, 38, 41, 26,
26, 35, 37, 25, 24, 34, 33, 38, 38, 26, 37, 42, 49, 26, 41, 38,
26, 32, 27, 33, 30, 26, 35, 46, 27, 22, 27, 30, 27, 47, 30, 26,
38, 46, 35, 45, 36, 38, 27, 27, 29, 30, 28, 26, 30, 38, 28),
education = structure(c(3L, 1L, 1L, 2L, 2L, 4L, 2L, 2L, 2L,
2L, 2L, 3L, 2L, 2L, 3L, 2L, 3L, 3L, 3L, 2L, 2L, 3L, 3L, 2L,
2L, 3L, 1L, 1L, 2L, 2L, 3L, 2L, 2L, 2L, 2L, 1L, 3L, 3L, 2L,
1L, 2L, 2L, 1L, 2L, 1L, 3L, 3L, 2L, 1L, 3L, 2L, 1L, 1L, 3L,
3L, 2L, 3L, 1L, 3L, 4L, 1L, 2L, 3L, 2L, 3L, 2L, 1L, 3L, 2L,
1L, 2L, 3L, 1L, 2L, 2L, 3L, 1L, 2L, 2L, 3L, 3L, 2L, 2L, 2L,
3L, 2L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 3L, 3L, 2L, 3L, 2L, 3L,
1L, 3L, 2L, 2L, 3L, 2L, 2L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 3L, 2L, 3L, 1L, 2L, 2L, 3L, 2L, 2L, 2L, 2L, 3L, 3L, 1L,
1L, 3L, 3L, 3L, 2L, 2L, 3L, 3L, 2L, 3L, 3L, 3L, 2L, 3L, 2L,
3L, 2L, 2L, 4L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 1L, 3L, 2L,
3L, 3L, 1L, 3L, 2L, 3L, 3L, 2L, 2L, 3L, 2L, 3L, 2L, 3L, 2L,
3L, 2L, 3L, 2L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 2L,
2L, 2L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 3L, 3L, 2L,
2L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L,
3L, 2L, 2L, 2L, 3L, 1L, 3L, 3L, 1L, 3L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 2L, 3L, 2L, 3L, 3L, 3L, 1L, 3L, 2L, 3L, 2L, 3L, 3L,
1L, 2L, 2L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 2L, 3L,
2L, 1L, 1L, 3L, 3L, 2L, 3L, 2L, 2L, 3L, 1L, 2L, 3L, 2L, 2L,
2L, 3L, 1L, 2L, 3L, 1L, 1L, 2L, 3L, 1L, 3L, 2L, 3L, 2L, 3L,
1L, 2L, 2L, 3L, 3L, 2L, 2L, 3L, 3L, 2L, 1L, 3L, 2L, 3L, 2L,
2L, 2L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 1L, 1L, 2L, 3L, 2L, 2L,
2L, 1L, 1L, 2L, 3L, 1L, 2L, 2L, 3L, 3L, 2L, 3L, 2L, 3L, 2L,
3L, 3L, 2L, 4L, 3L, 1L, 3L, 3L, 2L, 3L, 2L, 3L, 2L, 3L, 3L,
3L, 2L, 3L, 1L, 2L, 3L, 2L, 3L, 3L, 2L, 2L, 3L, 3L, 1L, 2L,
3L, 3L, 2L, 3L, 3L, 2L, 3L, 2L, 2L, 2L, 2L, 3L, 3L, 1L, 3L,
3L, 3L, 3L, 3L, 1L, 3L, 2L, 3L, 2L, 2L, 3L, 3L, 3L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 3L, 2L, 2L, 3L, 3L, 1L, 2L, 2L, 2L, 2L,
3L, 2L, 2L, 3L, 2L, 3L, 3L, 2L, 3L, 2L, 3L, 1L, 2L, 2L, 3L,
1L, 1L, 2L, 1L, 2L, 1L, 1L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 2L,
3L, 2L, 2L, 2L, 3L, 2L, 2L, 2L, 3L, 3L, 3L, 2L, 2L, 2L, 2L,
2L, 1L, 3L, 2L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 2L, 3L,
3L, 2L, 2L, 2L, 3L, 1L, 1L, 3L, 2L, 2L, 2L, 3L, 3L, 1L, 2L,
2L, 3L, 2L, 1L, 2L, 3L, 3L, 3L, 2L, 2L, 3L), levels = c("bachelor",
"college", "highschool", "master"), class = "factor"), gender = structure(c(2L,
1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 2L), levels = c("female", "male"), class = "factor")), row.names = c(NA,
-500L), class = c("tbl_df", "tbl", "data.frame"))
I tried following different tutorials but nothing worked with my code/data. https://www.r-bloggers.com/2020/05/using-xgboost-with-tidymodels/ https://www.r-bloggers.com/2020/05/tidymodels-and-xgbooost-a-few-learnings/ https://www.kirenz.com/blog/posts/2021-02-17-r-classification-tidymodels/index.html https://juliasilge.com/blog/xgboost-tune-volleyball/ https://juliasilge.com/blog/austin-housing/
Does this help? I adjusted step_dummy() so that it applies only to the predictors. prep() and bake() help you see what the pre-processing does to the data, but the workflow handles this for you.
# Working tidymodels + xgboost reprex for the binary `paid` outcome.
# Fixes versus the question's code:
#   * step_dummy(all_nominal_predictors()) so the factor OUTCOME is not
#     dummy-coded;
#   * the unprepped recipe is added to the workflow via add_recipe(), so
#     preprocessing is estimated inside each resample — no manual
#     prep()/bake();
#   * CV folds are built from the raw training data, not baked data;
#   * the engine's default binary objective is used (no "multi:softprob");
#   * uses the question's data object `dat` (the original answer referenced
#     an undefined `df`).
library(tidymodels)

# 80/20 split, stratified on the outcome
initial_split <- initial_split(dat, prop = 0.8, strata = paid)
train <- training(initial_split)

preprocessing_recipe <-
  recipe(train, paid ~ .) %>%
  step_dummy(all_nominal_predictors()) %>% ### predictors only
  step_normalize(all_numeric_predictors()) %>%
  step_nzv(all_numeric_predictors())

# 5-fold CV on the raw training data; the workflow bakes each fold itself
cv_folds <- vfold_cv(train, v = 5)

# boosted-tree spec with four tuned hyperparameters
xgbSpec <- boost_tree(
  trees = 50,
  min_n = tune(),
  tree_depth = tune(),
  learn_rate = tune(),
  loss_reduction = tune()
) %>%
  set_engine(engine = "xgboost") %>%
  set_mode(mode = "classification")

# space-filling grid of 10 hyperparameter combinations
xgboostGrid <- grid_latin_hypercube(
  min_n(),
  tree_depth(),
  learn_rate(),
  loss_reduction(),
  size = 10
)

# recipe + model bundled together; no add_formula() needed
xgboost_wf <- workflow() %>%
  add_recipe(preprocessing_recipe) %>%
  add_model(xgbSpec)

# grid search over the CV folds, scoring accuracy and saving predictions
xgboostFitted <-
  tune_grid(
    object = xgboost_wf,
    resamples = cv_folds,
    grid = xgboostGrid,
    metrics = metric_set(accuracy),
    control = control_grid(save_pred = TRUE)
  )

xgboostFitted |> autoplot()
Created on 2024-03-10 with reprex v2.1.0