I'm using several data pre-processing steps in the ML workflow below, and I want to serve the final model in production via a vetiver API.
The workflow and predictions work well in my local environment; however, I get an error when trying to run an example in the vetiver API. When keep_original_cols = FALSE in step_dummy, the error message says that the original columns are not present in the data. I tried keep_original_cols = TRUE, but in that case the error message relates to the format of the original columns (see example below).
Below is example from https://juliasilge.com/blog/sf-trees-random-tuning/
# Download the San Francisco street-tree data (TidyTuesday, 2020-01-28).
sf_trees <- read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-28/sf_trees.csv"
)

# Collapse the outcome to two classes, make plot_size numeric, drop the
# address column and incomplete rows, then turn every remaining character
# column into a factor.
trees_df <- sf_trees %>%
  mutate(
    legal_status = case_when(
      legal_status == "DPW Maintained" ~ legal_status,
      TRUE ~ "Other"
    ),
    plot_size = parse_number(plot_size)
  ) %>%
  select(-address) %>%
  na.omit() %>%
  # across() + where() replaces the superseded mutate_if().
  mutate(across(where(is.character), factor))

# Train/test split stratified on the outcome; the seed fixes the split.
set.seed(123)
trees_split <- initial_split(trees_df, strata = legal_status)
trees_train <- training(trees_split)
trees_test <- testing(trees_split)
# Preprocessing recipe: model legal_status from every other column.
tree_rec <- recipe(legal_status ~ ., data = trees_train) %>%
  # Keep tree_id in the data but give it an "ID" role instead of predictor.
  update_role(tree_id, new_role = "ID") %>%
  # Pool infrequent factor levels below the given thresholds.
  step_other(species, caretaker, threshold = 0.01) %>%
  step_other(site_info, threshold = 0.005) %>%
  # Spell out TRUE: `T` is an ordinary variable and can be reassigned.
  step_dummy(all_nominal(), -all_outcomes(), keep_original_cols = TRUE) %>%
  # step_date() runs after step_dummy(), so the features it adds are not
  # passed through the dummy-encoding step.
  step_date(date, features = c("month", "year")) %>%
  step_downsample(legal_status)

tree_prep <- prep(tree_rec)
# bake(new_data = NULL) returns the processed training set; it supersedes
# juice().
juiced <- bake(tree_prep, new_data = NULL)
# Random forest specification with 1000 trees; mtry and min_n are left as
# tune() placeholders to be filled in by the tuning runs.
tune_spec <- rand_forest(mtry = tune(), trees = 1000, min_n = tune()) %>%
  set_mode("classification") %>%
  set_engine("ranger")

# Bundle the recipe and the model specification into a single workflow.
tune_wf <- workflow(tree_rec, tune_spec)
# Cross-validation resamples of the training set (seeded for reproducibility).
set.seed(234)
trees_folds <- vfold_cv(trees_train)

# Register the doParallel backend before the tuning runs.
doParallel::registerDoParallel()

# First pass: tune over 20 candidate parameter combinations.
set.seed(345)
tune_res <- tune_grid(tune_wf, resamples = trees_folds, grid = 20)

# Second pass: a regular grid with 5 levels each of mtry and min_n.
rf_grid <- grid_regular(
  mtry(range = c(10, 30)),
  min_n(range = c(2, 8)),
  levels = 5
)

set.seed(456)
regular_res <- tune_grid(tune_wf, resamples = trees_folds, grid = rf_grid)
# Pick the parameter combination with the best ROC AUC. `metric` is passed by
# name: recent tune releases place it after `...`, so a positional value is no
# longer matched to it.
best_auc <- select_best(regular_res, metric = "roc_auc")

# Plug the selected parameters into the model specification and print it.
final_rf <- finalize_model(tune_spec, best_auc)
final_rf

# Final workflow: the same recipe plus the finalized model, fitted on the
# training set.
final_wf <- workflow() %>%
  add_recipe(tree_rec) %>%
  add_model(final_rf)

final_res <- final_wf %>%
  fit(data = trees_train)
# Creating board
# Board that the versioned model is written to further down.
model_board <- board_temp()
# Model versioning
# NOTE(review): rf_ptype is built from the *baked* recipe output, i.e. the data
# after step_other/step_dummy/step_date have run. final_res is a workflow that
# still contains that recipe, so this prototype describes the recipe's output
# rather than the raw columns the workflow was fitted on (trees_train).
rf_ptype <- extract_recipe(final_res) %>%
bake(new_data = trees_df, -all_outcomes()) %>%
vctrs::vec_ptype()
# The prototype above is handed to vetiver through save_ptype.
RF_mod <- vetiver_model(final_res, "RF_fit", save_ptype = rf_ptype)
# Store the versioned model on the board, then inspect the pin's metadata.
model_board %>% vetiver_pin_write(RF_mod)
# pin_meta() looks a pin up by name, so pass the name given to
# vetiver_model() ("RF_fit") rather than the model object itself.
model_board %>% pin_meta("RF_fit")
# Creating model API
# Serve the model locally. pr_run() blocks the R session while the API is up,
# so run the endpoint lines below from a separate session.
pr() %>%
  vetiver_api(RF_mod) %>%
  pr_run(port = 8088)

# The local plumber server speaks plain HTTP, so the URL uses http://, not
# https://.
endpoint <- vetiver_endpoint("http://127.0.0.1:8088/predict")
endpoint
The example input is:
[
{
"tree_id": 53227,
"species": "Celtis sinensis :: Chinese Hackberry",
"site_order": 6,
"site_info": "Sidewalk: Curb side : Cutout",
"caretaker": "SFUSD",
"date": "2005-03-12",
"dbh": 3,
"plot_size": 3,
"latitude": 37.73225,
"longitude": -122.3934,
"species_Eriobotrya.deflexa....Bronze.Loquat": 0,
"species_Ginkgo.biloba....Maidenhair.Tree": 0,
"species_Lagunaria.patersonii....Primrose.Tree": 0,
"species_Lophostemon.confertus....Brisbane.Box": 0,
"species_Magnolia.grandiflora..Little.Gem.....Little.Gem.Magnolia": 0,
"species_Magnolia.grandiflora..Samuel.Sommer.....Samuel.Sommer.Magnolia": 0.5,
"species_Magnolia.grandiflora....Southern.Magnolia": 0,
"species_Maytenus.boaria....Mayten": 0,
"species_Melaleuca.quinquenervia....Cajeput": 0,
"species_Metrosideros.excelsa....New.Zealand.Xmas.Tree": 0,
"species_Olea.europaea....Olive.Tree": 0,
"species_Pittosporum.undulatum....Victorian.Box": 0,
"species_Platanus.x.hispanica....Sycamore..London.Plane": 0,
"species_Prunus.cerasifera....Cherry.Plum": 0,
"species_Prunus.serrulata..Kwanzan.....Kwanzan.Flowering.Cherry": 0,
"species_Prunus.serrulata....Ornamental.Cherry": 0,
"species_Pyrus.calleryana....Ornamental.Pear": 0,
"species_Pyrus.kawakamii....Evergreen.Pear": 0,
"species_Tree.s....": 0,
"species_Tristania.conferta...": 0,
"species_Tristaniopsis.laurina..Elegant.....Small.leaf.Tristania..Elegant.": 0,
"species_Tristaniopsis.laurina....Swamp.Myrtle": 0,
"species_Ulmus.parvifolia....Chinese.Elm": 0,
"species_Washingtonia.robusta....Mexican.Fan.Palm": 0,
"species_other": 0,
"site_info_Sidewalk..Curb.side...Cutout": 1,
"site_info_Sidewalk..Curb.side...Yard": 0,
"site_info_Sidewalk..Property.side...Cutout": 0,
"site_info_other": 0,
"caretaker_Port": 0,
"caretaker_Private": 0,
"caretaker_SFUSD": 1,
"caretaker_other": 0,
"date_month": "Mar",
"date_year": 2005
}
]
The error message I get:
{
"error": "500 - Internal server error",
"message": "\u001b[1m\u001b[33mError\u001b[39m in `warn_to_error()`:\u001b[22m\n\u001b[33m!\u001b[39m [0, 2]: expected value in level set, but got 'Celtis sinensis :: Chinese Hackberry'\n"
}
I've also tried manually changing the input in the vetiver example to have the right formatting, but got:
{
"error": "500 - Internal server error",
"message": "\u001b[1m\u001b[33mError\u001b[39m in `warn_to_error()`:\u001b[22m\n\u001b[33m!\u001b[39m [0, 2]: expected value in level set, but got '1'\n"
}
I think the important thing to learn here is that when you deploy a tidymodels workflow, you deploy both the preprocessing recipe and the random forest estimator together. This means that the input data prototype involves the variables in its initial formatting, not the intermediate formatting after the recipe is applied. Take a look at implementing a model API in this way:
## train a model:
library(tidymodels)
library(tidyverse)
library(themis)
# Download the San Francisco street-tree data (TidyTuesday, 2020-01-28).
sf_trees <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-28/sf_trees.csv")
#> Rows: 192987 Columns: 12
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (6): legal_status, species, address, site_info, caretaker, plot_size
#> dbl (5): tree_id, site_order, dbh, latitude, longitude
#> date (1): date
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Collapse the outcome to two classes, make plot_size numeric, drop the
# address column and incomplete rows, then turn every remaining character
# column into a factor.
trees_df <- sf_trees %>%
  mutate(
    legal_status = case_when(
      legal_status == "DPW Maintained" ~ legal_status,
      TRUE ~ "Other"
    ),
    plot_size = parse_number(plot_size)
  ) %>%
  select(-address) %>%
  na.omit() %>%
  # across() + where() replaces the superseded mutate_if().
  mutate(across(where(is.character), factor))
#> Warning: 109 parsing failures.
#> row col expected actual
#> 10979 -- a number TR
#> 13245 -- a number CUT
#> 13495 -- a number TR
#> 13501 -- a number TR
#> 13502 -- a number TR
#> ..... ... ........ ......
#> See problems(...) for more details.
# Train/test split stratified on the outcome; the seed fixes the split.
set.seed(123)
trees_split <- initial_split(trees_df, strata = legal_status)
trees_train <- training(trees_split)
trees_test <- testing(trees_split)

# Preprocessing recipe: model legal_status from every other column.
tree_rec <- recipe(legal_status ~ ., data = trees_train) %>%
  # Keep tree_id in the data but give it an "ID" role instead of predictor.
  update_role(tree_id, new_role = "ID") %>%
  # Pool infrequent factor levels below the given thresholds.
  step_other(species, caretaker, threshold = 0.01) %>%
  step_other(site_info, threshold = 0.005) %>%
  # Spell out TRUE: `T` is an ordinary variable and can be reassigned.
  step_dummy(all_nominal(), -all_outcomes(), keep_original_cols = TRUE) %>%
  step_date(date, features = c("month", "year")) %>%
  step_downsample(legal_status)

# Fit recipe and model together as one workflow; that workflow, not the bare
# model, is the object that gets deployed.
rf_spec <- rand_forest(trees = 1e3, mode = "classification")
final_fit <- workflow(tree_rec, rf_spec) %>%
  fit(data = trees_train)
Now we can build the vetiver deployable model object, and version it. Notice that we don't need to provide our own input data prototype; it can be read automatically off the tidymodels workflow:
## version the model:
library(pins)
library(vetiver)
#>
#> Attaching package: 'vetiver'
#>
#> The following object is masked from 'package:tune':
#>
#> load_pkgs
# Board that holds the versioned model.
model_board <- board_temp()
# No save_ptype argument here: the input data prototype is read off the
# fitted workflow.
v <- vetiver_model(final_fit, "RF-fit")
vetiver_pin_write(model_board, v)
#> Creating new version '20221212T182406Z-cc5f5'
#> Writing to pin 'RF-fit'
#>
#> Create a Model Card for your published model
#> • Model Cards provide a framework for transparent, responsible reporting
#> • Use the vetiver `.Rmd` template as a place to start
pin_meta(model_board, "RF-fit")
#> List of 12
#> $ file : chr "RF-fit.rds"
#> $ file_size : 'fs_bytes' int 11.8M
#> $ pin_hash : chr "cc5f56b86e7184ec"
#> $ type : chr "rds"
#> $ title : chr "RF-fit: a pinned list"
#> $ description: chr "A ranger classification modeling workflow"
#> $ tags : NULL
#> $ created : POSIXct[1:1], format: "2022-12-12 11:24:06"
#> $ api_version: num 1
#> $ user : list()
#> $ name : chr "RF-fit"
#> $ local :List of 3
#> ..$ dir : 'fs_path' chr "/var/folders/hv/hzsmmyk9393_m7q3nscx1slc0000gn/T/RtmpXM1fFS/pins-ebb03322c3e4/RF-fit/20221212T182406Z-cc5f5"
#> ..$ url : NULL
#> ..$ version: chr "20221212T182406Z-cc5f5"
And then we can use that to set up an API:
## set up model API:
library(plumber)
# Build the router only; pipe the result into pr_run(port = 8088) to serve it.
vetiver_api(pr(), v)
#> # Plumber router with 2 endpoints, 4 filters, and 1 sub-router.
#> # Use `pr_run()` on this object to start the API.
#> ├──[queryString]
#> ├──[body]
#> ├──[cookieParser]
#> ├──[sharedSecret]
#> ├──/logo
#> │ │ # Plumber static router serving from directory: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/vetiver
#> ├──/ping (GET)
#> └──/predict (POST)
Created on 2022-12-12 with reprex v2.0.2
Here is what the interactive visual documentation for the API looks like:
Notice that the `species` variable is still in its string (factor in R) formatting, not dummy variables.
If, for some reason, you do need to provide a custom input data prototype, you would use `vctrs::vec_ptype(trees_train)`, because that is the input data for the workflow (not the output of the recipe).