r, time-series, confidence-interval, ensemble-learning, tidymodels

How to extract confidence intervals from modeltime recursive ensembles?


Since I want to produce some visualizations and analyses of the forecasted data outside the modeltime framework, I need to extract the confidence values, the fitted values and maybe also the residuals.

The documentation indicates that I need to use the function modeltime_calibrate() to get the confidence values and the residuals. So one question is: where do I extract the fitted values from?

My main question, however, is how to do calibration on recursive ensembles. For any non-ensemble model I was able to do it, but with recursive ensembles I get error messages whenever I try to calibrate.

To illustrate the problem, look at the example code below, which ends up failing to calibrate all models:

library(modeltime.ensemble)
library(modeltime)
library(tidymodels)
library(earth)
library(glmnet)
library(xgboost)
library(tidyverse)
library(lubridate)
library(timetk)


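# Forecast horizon; each series is extended this many periods into the future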
FORECAST_HORIZON <- 24

m4_extended <- m4_monthly %>%
  group_by(id) %>%
  future_frame(
    .length_out = FORECAST_HORIZON,
    .bind_data  = TRUE
  ) %>%
  ungroup()

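# Add lags 1..FORECAST_HORIZON per series (also used later as the recursive transform)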
lag_transformer_grouped <- function(data){
  data %>%
    group_by(id) %>%
    tk_augment_lags(value, .lags = 1:FORECAST_HORIZON) %>%
    ungroup()
}

m4_lags <- m4_extended %>%
  lag_transformer_grouped()

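# Train / test / future splits based on the lagged data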
test_data <- m4_lags %>%
  group_by(id) %>%
  slice_tail(n = 12) %>%
  ungroup()

train_data <- m4_lags %>%
  drop_na()

future_data <- m4_lags %>%
  filter(is.na(value))

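# Fit the individual member models on the lagged training data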
model_fit_glmnet <- linear_reg(penalty = 1) %>%
  set_engine("glmnet") %>%
  fit(value ~ ., data = train_data)

model_fit_xgboost <- boost_tree("regression", learn_rate = 0.35) %>%
  set_engine("xgboost") %>%
  fit(value ~ ., data = train_data)

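# Combine the two models into a weighted ensemble and wrap it in recursive() for the panel data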
recursive_ensemble_panel <- modeltime_table(
  model_fit_glmnet,
  model_fit_xgboost
) %>%
  ensemble_weighted(loadings = c(4, 6)) %>%
  recursive(
    transform  = lag_transformer_grouped,
    train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
    id         = "id"
  )

model_tbl <- modeltime_table(
  recursive_ensemble_panel
)

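# Calibration -- this is the step that produces the error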
calibrated_mod <- model_tbl %>%
  modeltime_calibrate(test_data, id = "id", quiet = FALSE)

model_tbl %>%
  modeltime_forecast(
    new_data    = future_data,
    actual_data = m4_lags,
    keep_data   = TRUE
  ) %>%
  group_by(id) %>%
  plot_modeltime_forecast(
    .interactive        = FALSE,
    .conf_interval_show = TRUE,
    .facet_ncol         = 2
  )

Solution

  • I had to do some experimentation to find the right way to extract what I need (confidence intervals and residuals).

    As you can see from the example code below, the workflow of the models needs to change to achieve this: recursive() has to be applied to the fitted workflow object, not to the model specification and not to the ensemble fit.

    I still have to run some more tests here, but I think this gives me what I need:

    # Time Series ML
    library(tidymodels)
    library(modeltime)
    library(modeltime.ensemble)
    
    # Core
    library(tidyverse)
    library(timetk)
    
    
    # Data definition: horizon, lags, splits
    FORECAST_HORIZON <- 24
    
    lag_transformer_grouped <- function(data){
      data %>%
        group_by(id) %>%
        tk_augment_lags(value, .lags = 1:FORECAST_HORIZON) %>%
        ungroup()
    }
    
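    # m750 is the monthly example series that ships with the modeltime package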
    m750_lags <- m750 %>%
      lag_transformer_grouped()
    
    test_data <- m750_lags %>%
      group_by(id) %>%
      slice_tail(n = 12) %>%
      ungroup()
    
    train_data <- m750_lags %>%
      drop_na()
    
    future_data <- m750_lags %>%
      filter(is.na(value))
    
    # Recipe: calendar features, dummies, Fourier terms
    recipe_spec <- recipe(value ~ date, train_data) %>%
      step_timeseries_signature(date) %>%
      step_rm(matches("(.iso$)|(.xts$)")) %>%
      step_normalize(matches("(index.num$)|(_year$)")) %>%
      step_dummy(all_nominal()) %>%
      step_fourier(date, K = 1, period = 12)
    
    recipe_spec %>% prep() %>% juice()
    
    # Elastic net (glmnet) workflow
    model_fit_glmnet <- linear_reg(penalty = 1) %>%
      set_engine("glmnet") 
    
    wflw_fit_glmnet <- workflow() %>%
      add_model(model_fit_glmnet) %>%
      add_recipe(recipe_spec %>% step_rm(date)) %>%
      fit(train_data) %>%
      recursive(
        transform  = lag_transformer_grouped,
        train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
        id         = "id"
      )
    
    # XGBoost workflow
    model_fit_xgboost <- boost_tree("regression", learn_rate = 0.35) %>%
      set_engine("xgboost")
    
    wflw_fit_xgboost <- workflow() %>%
      add_model(model_fit_xgboost) %>%
      add_recipe(recipe_spec %>% step_rm(date)) %>%
      fit(train_data) %>%
      recursive(
        transform  = lag_transformer_grouped,
        train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
        id         = "id"
      )
    
    # Modeltime table of the recursive workflows
    m750_models <- modeltime_table(
      wflw_fit_xgboost,
      wflw_fit_glmnet
    )
    
    # Ensemble fit (simple average of the two workflows)
    ensemble_fit <- m750_models %>%
     ensemble_average(type = "mean")
    
    # Calibrate the ensemble on the test set
    calibration_tbl <- modeltime_table(
      ensemble_fit
    ) %>%
      modeltime_calibrate(test_data)
    
    # Residuals and fitted values from the calibration data, joined back to the test set
    calib_out <- calibration_tbl$.calibration_data[[1]] %>% 
      left_join(test_data %>% select(id, date, value))
    
    
    # Forecast ex post
    ex_post_obj <- 
    calibration_tbl %>%
      modeltime_forecast(
        new_data    = test_data,
        actual_data = m750
      )
    
    
    # Forecast ex ante
    data_prepared_tbl <- bind_rows(train_data, test_data)
    
    future_tbl <- data_prepared_tbl %>%
      group_by(id) %>%
      future_frame(.length_out = "2 years") %>%
      ungroup()
    
    ex_ante_obj <- 
      calibration_tbl %>%
      modeltime_forecast(
        new_data    = future_tbl,
        actual_data = m750
      )
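
    From calib_out and the forecast objects, the values asked about in the question can be extracted directly. A minimal sketch, relying on the columns modeltime returns (.prediction and .residuals in the calibration data, .conf_lo / .conf_hi in the forecast output):

    # Fitted values and residuals on the test set
    # (.prediction plays the role of the fitted values; .residuals = .actual - .prediction)
    fitted_resid_tbl <- calib_out %>%
      select(date, .actual, .prediction, .residuals)

    # Confidence intervals are attached by modeltime_forecast() once the table is calibrated
    conf_interval_tbl <- ex_ante_obj %>%
      filter(.key == "prediction") %>%
      select(.index, .value, .conf_lo, .conf_hi)

    The same columns are available in ex_post_obj, so the plots and analyses outside modeltime can be built from these tibbles.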