Sometimes I don't want to assess my models on their performance predicting single observations; rather, I want to assess how a model performs for predictions aggregated over groups. The group resampling tools in rsample, like group_vfold_cv(), are great for ensuring that data splitting keeps groups together, but I also want to assess models on group-level performance rather than performance for single observations.
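(As a quick sanity check of what I mean by keeping groups together, here's a small sketch on the raw ames data; folds_check is just a throwaway name. The idea is that every house in a neighborhood lands in the same assessment set, so each neighborhood should show up in exactly one fold.)
library(tidymodels)
set.seed(123)
folds_check <- group_vfold_cv(ames, group = Neighborhood, v = 5)
#For each fold, list the neighborhoods in its assessment set, then count how
#many folds each neighborhood appears in; it should always be exactly one
folds_check$splits %>%
map_dfr(~ tibble(Neighborhood = unique(assessment(.x)$Neighborhood)), .id = "fold") %>%
count(Neighborhood, name = "folds_seen_in") %>%
count(folds_seen_in)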
For example, maybe I want to use a model that predicts individual housing prices, but I'm ultimately going to use the model to estimate how much a neighborhood is worth.
Using the Ames dataset as an example, we can build models to predict a house's sale price. But instead of tuning the model based on its performance for predicting individual houses, I want to tune it on its performance in predicting the sum of housing prices for a neighborhood. (I'm imagining that the Ames dataset is "complete" for each neighborhood.)
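To make that concrete, here is a toy sketch of the kind of metric I have in mind (made-up numbers, not the Ames data): sum the actual and the predicted prices within each neighborhood, then compute RMSE on those neighborhood totals.
library(tidymodels)
#Toy data: two houses each in two made-up neighborhoods
toy <- tribble(
~Neighborhood, ~Sale_Price, ~.pred,
"A", 100000, 110000,
"A", 200000, 190000,
"B", 150000, 140000,
"B", 250000, 265000
)
toy %>%
group_by(Neighborhood) %>%
summarise(Sale_Price = sum(Sale_Price), .pred = sum(.pred)) %>%
rmse(truth = Sale_Price, estimate = .pred)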
I have provided sample code below; for speed, I kept the resampling and the tuning grid minimal.
#Load in data and transform Neighborhood variable a little
library(tidymodels)
df <- ames
df <- recipe(Sale_Price ~ ., data = df) %>%
step_other(Neighborhood, threshold = .04) %>%
prep() %>%
bake(new_data = df)
#Split data based on neighborhoods
set.seed(1)
df_splits <- group_initial_split(df, group = Neighborhood)
df_train <- training(df_splits)
df_test <- testing(df_splits)
set.seed(2)
df_folds <- group_vfold_cv(df_train, group = Neighborhood, v = 5, repeats = 1)
#Simple recipe for modeling Sale_Price
rec <- recipe(Sale_Price ~ Lot_Area + Year_Built + Gr_Liv_Area, data = df_train)
#Setting up specifications for MARS and RF
mars_earth_spec <-
mars(prod_degree = tune()) %>%
set_engine('earth') %>%
set_mode('regression')
rand_forest_ranger_spec <-
rand_forest(mtry = tune(), min_n = tune()) %>%
set_engine('ranger') %>%
set_mode('regression')
#Setting up the workflow set that pairs our recipe with the models
no_pre_proc <-
workflow_set(
preproc = list(simple = rec),
models = list(MARS = mars_earth_spec, RF = rand_forest_ranger_spec)
)
#Tune the models
grid_ctrl <-
control_grid(
save_pred = TRUE,
parallel_over = "everything",
save_workflow = TRUE
)
grid_results <-
no_pre_proc %>%
workflow_map(
seed = 1503,
resamples = df_folds,
grid = 5,
control = grid_ctrl
)
#Ranking the models by RMSE based on their performance predicting individual houses
grid_results %>%
rank_results() %>%
filter(.metric == "rmse") %>%
select(model, .config, rmse = mean, rank)
#This is not what I want
#I want to rank the models by RMSE of aggregate predictions per neighborhood against the aggregate sale price
#Maybe I need something like... truth = sum(Sale_Price, by = Neighborhood), estimate = sum(.pred, by = Neighborhood)
I can assess a model's RMSE for individual houses, but I want to assess its RMSE for neighborhood worth.
There isn't built-in support for that goal, but you should be able to do it manually. Since we have save_pred = TRUE in control_grid(), we can get all of those predictions using collect_predictions() with summarize = FALSE. Then a series of {dplyr} functions plus rmse(), which can be applied to grouped data frames, should give you what you want.
#Load in data and transform Neighborhood variable a little
library(tidymodels)
df <- ames
df <- recipe(Sale_Price ~ ., data = df) %>%
step_other(Neighborhood, threshold = .04) %>%
prep() %>%
bake(new_data = df)
#Split data based on neighborhoods
set.seed(1)
df_splits <- group_initial_split(df, group = Neighborhood)
df_train <- training(df_splits)
df_test <- testing(df_splits)
set.seed(2)
df_folds <- group_vfold_cv(df_train, group = Neighborhood, v = 5, repeats = 1)
#Simple recipe for modeling Sale_Price
rec <- recipe(Sale_Price ~ Lot_Area + Year_Built + Gr_Liv_Area, data = df_train)
#Setting up specifications for MARS and RF
mars_earth_spec <-
mars(prod_degree = tune()) %>%
set_engine('earth') %>%
set_mode('regression')
rand_forest_ranger_spec <-
rand_forest(mtry = tune(), min_n = tune()) %>%
set_engine('ranger') %>%
set_mode('regression')
#Setting up the workflow set that pairs our recipe with the models
no_pre_proc <-
workflow_set(
preproc = list(simple = rec),
models = list(MARS = mars_earth_spec, RF = rand_forest_ranger_spec)
)
#Tune the models
grid_ctrl <-
control_grid(
save_pred = TRUE,
parallel_over = "everything",
save_workflow = TRUE
)
grid_results <-
no_pre_proc %>%
workflow_map(
seed = 1503,
resamples = df_folds,
grid = 5,
control = grid_ctrl
)
#> i Creating pre-processing data to finalize unknown parameter: mtry
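#(Optional) peek at the unsummarized predictions: one row per held-out house,
#per resample and per tuning configuration
grid_results %>%
collect_predictions(summarize = FALSE) %>%
glimpse()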
grid_results %>%
collect_predictions(summarize = FALSE) %>%
#.row indexes back into df_train, so we can recover each house's neighborhood
mutate(Neighborhood = df_train$Neighborhood[.row]) %>%
#add up actual and predicted prices within each neighborhood, per fold and configuration
group_by(id, model, .config, Neighborhood) %>%
summarise(Sale_Price = sum(Sale_Price), .pred = sum(.pred), .groups = "drop") %>%
#neighborhood-level RMSE within each fold, then average across folds
group_by(id, model, .config) %>%
rmse(truth = Sale_Price, estimate = .pred) %>%
group_by(model, .config) %>%
summarise(mean_rmse = mean(.estimate), .groups = "drop") %>%
arrange(mean_rmse)
#> # A tibble: 7 × 3
#> model .config mean_rmse
#> <chr> <chr> <dbl>
#> 1 rand_forest Preprocessor1_Model1 2667177.
#> 2 mars Preprocessor1_Model2 2695526.
#> 3 rand_forest Preprocessor1_Model4 2819628.
#> 4 rand_forest Preprocessor1_Model5 2824109.
#> 5 rand_forest Preprocessor1_Model3 2845252.
#> 6 rand_forest Preprocessor1_Model2 3059321.
#> 7 mars Preprocessor1_Model1 3563432.
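If you then want to carry the winner of this neighborhood-level ranking forward, something along these lines should work. I'm assuming the top row above corresponds to the "simple_RF" workflow (the wflow_id that workflow_set() builds from the "simple" recipe and the "RF" model) and its "Preprocessor1_Model1" configuration; swap in whichever workflow and .config wins for you.
#Pull the tuning parameter values for the winning configuration
best_params <-
grid_results %>%
extract_workflow_set_result(id = "simple_RF") %>%
collect_metrics() %>%
filter(.config == "Preprocessor1_Model1") %>%
select(mtry, min_n) %>%
distinct()
#Finalize that workflow, fit it on the training neighborhoods, and predict the test neighborhoods
final_fit <-
grid_results %>%
extract_workflow(id = "simple_RF") %>%
finalize_workflow(best_params) %>%
last_fit(df_splits)
#Same neighborhood-level RMSE, now on the held-out test neighborhoods
#(.row indexes back into df, the data used for the initial split)
final_fit %>%
collect_predictions() %>%
mutate(Neighborhood = df$Neighborhood[.row]) %>%
group_by(Neighborhood) %>%
summarise(Sale_Price = sum(Sale_Price), .pred = sum(.pred)) %>%
rmse(truth = Sale_Price, estimate = .pred)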