Tags: r, mgcv, timetk

How to predict and add values to original dataset


I have a large dataset for which I need to predict some values; a small reproducible example is below. I would like to predict a few years ahead using either gam or glm, but I can't wrap my head around it since I don't have a statistics background. Could someone guide me on how to predict and add the predicted values to my original dataset?

library(mgcv)
library(tidyverse)
# Example data (dput output): a 31-row tibble with columns year, month, date
# and turb, covering August-December readings from 2003 to 2007.
m <- structure(list(year = c(2003, 2003, 2003, 2003, 2003, 2003, 2003, 
2003, 2003, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 
2005, 2005, 2005, 2006, 2006, 2006, 2006, 2006, 2007, 2007, 2007, 
2007, 2007), month = c("August", "August", "September", "September", 
"October", "October", "November", "November", "December", "August", 
"August", "September", "September", "October", "October", "November", 
"November", "December", "August", "August", "September", "October", 
"October", "November", "November", "December", "August", "August", 
"September", "September", "October"), date = structure(c(12265, 
12281, 12296, 12311, 12326, 12342, 12357, 12372, 12387, 12631, 
12647, 12662, 12677, 12692, 12708, 12723, 12738, 12753, 12996, 
13012, 13027, 13422, 13438, 13453, 13468, 13483, 13726, 13742, 
13757, 13772, 13787), class = "Date"), turb = c(12.5, 11.8, 8.9, 
6.2, 5.1, 3.4, -0.1, 0, -0.1, 12.5, 10.4, 8.6, 6.1, 4.7, 1.4, 
0.1, -0.1, -0.1, 12.5, 11, 8, 4.4, 1.1, 0.5, -0.2, -0.2, 12.5, 
11.7, 10, 5.9, 3.6)), row.names = c(NA, -31L), class = c("tbl_df", 
"tbl", "data.frame"))

# Convert the Date column to its underlying numeric value; this is what
# s(date) is fitted on below.
# NOTE: this overwrites `date` in place, so m$date is no longer a Date and
# Date-based arithmetic on it will not work after this line.
m$date <- as.numeric(m$date)
# Fit a GAM of turb on a single smooth term of the numeric date.
mod1 <- gam(turb ~ s(date), data = m)
# Print the fitted model's summary.
summary(mod1)

# How can I predict 3 more years (to 2010) and have those predictions
# added to the bottom of the original dataset (m)?

Solution

  • Perhaps I'm wrong, but I think you're trying to predict turb based on the smoothed date. If so, then

    # `%m+%` and months(<numeric>) come from lubridate. library(tidyverse) only
    # attaches lubridate from tidyverse 2.0.0 onward, so load it explicitly.
    library(lubridate)

    # The date arithmetic below needs `date` to be a Date. If the question's
    # `m$date <- as.numeric(m$date)` was already run, turn it back into a Date.
    if (!inherits(m$date, "Date")) {
      m$date <- as.Date(m$date, origin = "1970-01-01")
    }

    # Fit on a numeric copy of the date so that `date` itself stays unchanged.
    m$datenum <- as.numeric(m$date)
    mod1 <- mgcv::gam(turb ~ s(datenum), data = m)

    # Future dates: step forward 0..36 months from the last observed date and
    # pair every step with a +15-day offset. `[-1]` drops the first element,
    # which is the last observed date itself (already present in `m`).
    month_steps <- max(m$date) %m+% months(0:36)
    new_m <- tibble(date = c(outer(month_steps, c(0, 15), `+`))[-1]) %>%
      mutate(datenum = as.numeric(date))
    new_m
    # # A tibble: 73 × 2
    #    date       datenum
    #    <date>       <dbl>
    #  1 2007-11-01   13818
    #  2 2007-12-01   13848
    #  3 2008-01-01   13879
    #  4 2008-02-01   13910
    #  5 2008-03-01   13939
    #  6 2008-04-01   13970
    #  7 2008-05-01   14000
    #  8 2008-06-01   14031
    #  9 2008-07-01   14061
    # 10 2008-08-01   14092
    # # ℹ 63 more rows
    # # ℹ Use `print(n = ...)` to see more rows
    # Predict turb for every row of new_m; the result is named by row index.
    # NOTE(review): every date in new_m lies after the last fitted date, so these
    # are extrapolations of the smooth. The values printed below fall far under
    # the smallest observed turb (-0.2) -- sanity-check before relying on them.
    predict(mod1, newdata = new_m)
    #           1           2           3           4           5           6           7           8           9          10          11          12          13 
    #   -1.246062   -6.392131  -11.709736  -17.027341  -22.001874  -27.319479  -32.465548  -37.783153  -42.929222  -48.246827  -53.564432  -58.710501  -64.028106 
    #          14          15          16          17          18          19          20          21          22          23          24          25          26 
    #  -69.174175  -74.491780  -79.809385  -84.612383  -89.929987  -95.076057 -100.393661 -105.539731 -110.857335 -116.174940 -121.321010 -126.638614 -131.784684 
    #          27          28          29          30          31          32          33          34          35          36          37          38          39 
    # -137.102288 -142.419893 -147.222891 -152.540496 -157.686565 -163.004170 -168.150239 -173.467844 -178.785449 -183.931518    1.498509   -3.819096   -8.965165 
    #          40          41          42          43          44          45          46          47          48          49          50          51          52 
    #  -14.282770  -19.600375  -24.574909  -29.892514  -35.038583  -40.356188  -45.502257  -50.819862  -56.137466  -61.283536  -66.601140  -71.747210  -77.064814 
    #          53          54          55          56          57          58          59          60          61          62          63          64          65 
    #  -82.382419  -87.185417  -92.503022  -97.649091 -102.966696 -108.112765 -113.430370 -118.747975 -123.894044 -129.211649 -134.357718 -139.675323 -144.992928 
    #          66          67          68          69          70          71          72          73 
    # -149.795926 -155.113531 -160.259600 -165.577205 -170.723274 -176.040879 -181.358483 -186.504553 
    

    This can be combined with the original data with

    # Add predicted turb to the future dates, derive `year` and `month` so the
    # columns line up with `m`, then append the observed rows and sort by date.
    # `newdata = new_m` is passed explicitly instead of cur_data(), which is
    # deprecated since dplyr 1.1.0; the rows handed to predict() are the same.
    combined <- new_m %>%
      mutate(
        turb = predict(mod1, newdata = new_m),
        year = as.integer(format(date, format = "%Y")),
        # "%B" yields the full month name in the current locale.
        month = format(date, format = "%B")
      ) %>%
      bind_rows(m) %>%
      arrange(date)
    combined
    # # A tibble: 104 × 5
    #    date       datenum      turb  year month    
    #    <date>       <dbl> <dbl[1d]> <dbl> <chr>    
    #  1 2003-08-01   12265      12.5  2003 August   
    #  2 2003-08-17   12281      11.8  2003 August   
    #  3 2003-09-01   12296       8.9  2003 September
    #  4 2003-09-16   12311       6.2  2003 September
    #  5 2003-10-01   12326       5.1  2003 October  
    #  6 2003-10-17   12342       3.4  2003 October  
    #  7 2003-11-01   12357      -0.1  2003 November 
    #  8 2003-11-16   12372       0    2003 November 
    #  9 2003-12-01   12387      -0.1  2003 December 
    # 10 2004-08-01   12631      12.5  2004 August   
    # # ℹ 94 more rows
    # # ℹ Use `print(n = ...)` to see more rows
    # Show the last rows of the combined table, i.e. the latest predicted dates.
    tail(combined)
    # # A tibble: 6 × 5
    #   date       datenum      turb  year month    
    #   <date>       <dbl> <dbl[1d]> <dbl> <chr>    
    # 1 2010-08-01   14822     -173.  2010 August   
    # 2 2010-08-16   14837     -176.  2010 August   
    # 3 2010-09-01   14853     -179.  2010 September
    # 4 2010-09-16   14868     -181.  2010 September
    # 5 2010-10-01   14883     -184.  2010 October  
    # 6 2010-10-16   14898     -187.  2010 October