rforecastfable-rtsibble

Forecasting with ARIMA and dummy variables


I am attempting to include a dummy regressor that notes the beginning of the pandemic and runs a regression with ARIMA errors.

My dataset revolves around breaking & entering's happening in Toronto from 2014 to 2021. The issue is that the trend takes a turn due to covid-19 around 2020.

Auto.arima provides me with a ARIMA(1,0,1) model as it is not taking into account the impact of covid-19 and is performing according to the implied return to the series average.

When trying to include a dummy regressor that notes the beginning of the pandemic and run a regression with ARIMA errors I get the following error:

In ifelse(time(BEDATA_GROUPEDtsssarima) >= yearmonth("2020-03"),  :
  Incompatible methods ("Ops.ts", ">=.vctrs_vctr") for ">="

Code:

# Create a binary time series that indicates the start of the pandemic
library(fpp3)
library(forecast)
library(zoo)

# Check if timeseries
class(BEDATA_GROUPED)

#Convert timeseries
BEDATA_GROUPEDtsssarima <- ts(BEDATA_GROUPED[,2], frequency = 12, start = c(2014, 1))
class(BEDATA_GROUPEDtsssarima)

#Plot
forecast::autoplot(BEDATA_GROUPEDtsssarima)

# Assume that the pandemic began in March 2020
pandemic_dummy <- ifelse(time(BEDATA_GROUPEDtsssarima) >= yearmonth("2020-03"), 1, 0)

# Use auto.arima() to fit an ARIMA model with the dummy variable as an exogenous variable
beddatamodel <- auto.arima(BEDATA_GROUPEDtsssarima, xreg = pandemic_dummy, ic="aic", trace = TRUE)

# Create a binary time series that indicates the start of the pandemic
# In this example, we will assume that the pandemic began in March 2020
pandemic_dummy <- ifelse(time(BEDATA_GROUPEDtsssarima) >= yearmonth("2020-03"), 1, 0)

# Use auto.arima() to fit an ARIMA model with the dummy variable as an exogenous variable
beddatamodel <- auto.arima(BEDATA_GROUPEDtsssarima, xreg = pandemic_dummy, ic="aic", trace = TRUE)

# Create a binary time series for the forecast period that includes the pandemic dummy variable
forecast_period <- time(BEDATA_GROUPEDtsssarima)["2022/01/01/":"2023/12/31/"]
pandemic_dummy_forecast <- ifelse(forecast_period >= yearmonth("2020-03"), 1, 0)

# Use the forecast()
forecast(pandemic_dummy_forecast)

Dataset:

structure(list(occurrence_yrmn = c("2014-January", "2014-February", 
"2014-March", "2014-April", "2014-May", "2014-June", "2014-July", 
"2014-August", "2014-September", "2014-October", "2014-November", 
"2014-December", "2015-January", "2015-February", "2015-March", 
"2015-April", "2015-May", "2015-June", "2015-July", "2015-August", 
"2015-September", "2015-October", "2015-November", "2015-December", 
"2016-January", "2016-February", "2016-March", "2016-April", 
"2016-May", "2016-June", "2016-July", "2016-August", "2016-September", 
"2016-October", "2016-November", "2016-December", "2017-January", 
"2017-February", "2017-March", "2017-April", "2017-May", "2017-June", 
"2017-July", "2017-August", "2017-September", "2017-October", 
"2017-November", "2017-December", "2018-January", "2018-February", 
"2018-March", "2018-April", "2018-May", "2018-June", "2018-July", 
"2018-August", "2018-September", "2018-October", "2018-November", 
"2018-December", "2019-January", "2019-February", "2019-March", 
"2019-April", "2019-May", "2019-June", "2019-July", "2019-August", 
"2019-September", "2019-October", "2019-November", "2019-December", 
"2020-January", "2020-February", "2020-March", "2020-April", 
"2020-May", "2020-June", "2020-July", "2020-August", "2020-September", 
"2020-October", "2020-November", "2020-December", "2021-January", 
"2021-February", "2021-March", "2021-April", "2021-May", "2021-June", 
"2021-July", "2021-August", "2021-September", "2021-October", 
"2021-November", "2021-December"), MCI = c(586, 482, 567, 626, 
625, 610, 576, 634, 636, 663, 657, 556, 513, 415, 510, 542, 549, 
618, 623, 666, 641, 632, 593, 617, 541, 523, 504, 536, 498, 552, 
522, 519, 496, 541, 602, 570, 571, 492, 560, 525, 507, 523, 593, 
623, 578, 657, 683, 588, 664, 582, 619, 512, 630, 644, 563, 654, 
635, 732, 639, 748, 719, 567, 607, 746, 739, 686, 805, 762, 696, 
777, 755, 675, 704, 617, 732, 609, 464, 487, 565, 609, 513, 533, 
505, 578, 526, 418, 428, 421, 502, 452, 509, 492, 478, 469, 457, 
457)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-96L))

Solution

  • I see you have used the fpp3 library, so I've had a go using the tidyverts tools. I've had a go at three models: a plain ARIMA, a plain regression using the pandemic dummy variable, and a dynamic model using both ARIMA and the dummy variable. Hope this helps! :-)

    library(tsibble)
    library(fable)
    library(fabletools)
    library(feasts)
    library(dplyr)
    

    Create a tsibble:

    BEDATA_GROUPED <- BEDATA_GROUPED |>
      mutate(Month = yearmonth(occurrence_yrmn)) |>
      as_tsibble(index = Month)
    
    autoplot(BEDATA_GROUPED)
    

    Assume that the pandemic began in March 2020 and create a dummy variable:

    pandemic_start <- yearmonth("2020-03-01")
    BEDATA_GROUPED <- BEDATA_GROUPED |>
      mutate(pandemic_dummy = ifelse(Month >= pandemic_start, 1, 0))
    

    Work up a plain ARIMA:

    BEDATA_GROUPED_arima <- BEDATA_GROUPED |>
      model(ARIMA(MCI, stepwise = FALSE))
    
    BEDATA_GROUPED_arima |>
      gg_tsresiduals()
    
    BEDATA_GROUPED_arima |>
      forecast(h = 5) |>
      autoplot()
    

    Work up a plain regression:

    BEDATA_GROUPED_TSLM <- BEDATA_GROUPED |>
      model(TSLM(MCI ~ pandemic_dummy)) |>
      report()
    
    BEDATA_GROUPED_TSLM |>
      gg_tsresiduals()
    

    Make a data set to predict on:

    new_data <- structure(list(Month = structure(c(18993, 19024, 19052, 19083, 
                                                   19113), class = c("yearmonth", "vctrs_vctr")), pandemic_dummy = c(1, 
                                                                                                                     1, 1, 1, 1)), class = c("tbl_ts", "tbl_df", "tbl", "data.frame"
                                                                                                                     ), row.names = c(NA, -5L), key = structure(list(.rows = structure(list(
                                                                                                                       1:5), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr", 
                                                                                                                                                           "list"))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
                                                                                                                                                                                                                              -1L)), index = structure("Month", ordered = TRUE), index2 = "Month", interval = structure(list(
                                                                                                                                                                                                                                year = 0, quarter = 0, month = 1, week = 0, day = 0, hour = 0, 
                                                                                                                                                                                                                                minute = 0, second = 0, millisecond = 0, microsecond = 0, 
                                                                                                                                                                                                                                nanosecond = 0, unit = 0), .regular = TRUE, class = c("interval", 
                                                                                                                                                                                                                                                                                      "vctrs_rcrd", "vctrs_vctr")))
    

    Forecast plain regression:

    BEDATA_GROUPED_TSLM |>
      forecast(new_data = new_data) |>
      autoplot()
    

    Work up a dynamic regression, with ARIMA and the pandemic dummy variablee:

    BEDATA_GROUPED_dyn_ARIMA <- BEDATA_GROUPED |>
      model(ARIMA(MCI ~ pandemic_dummy)) |>
      report()
    
    BEDATA_GROUPED_dyn_ARIMA |>
      gg_tsresiduals()
    
    BEDATA_GROUPED_dyn_ARIMA |>
      forecast(new_data = new_data) |>
      autoplot()