I have a large dataset for which I need to predict some values; a small reproducible example is below. I would like to predict a few years ahead using either a GAM or a GLM, but I can't wrap my head around it because I don't have a statistics background. Could someone guide me on how to make the predictions and add the predicted values to my original dataset?
library(mgcv)
library(tidyverse)
# Reproducible sample data as a dput()-style literal: a tibble with 31 rows.
#   year  - calendar year (numeric), 2003 to 2007
#   month - month name (character), August to December
#   date  - Date column, stored as the number of days since 1970-01-01
#   turb  - numeric response that is modelled below
#           (presumably turbidity - confirm with the data owner)
m <- structure(list(year = c(2003, 2003, 2003, 2003, 2003, 2003, 2003,
2003, 2003, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004,
2005, 2005, 2005, 2006, 2006, 2006, 2006, 2006, 2007, 2007, 2007,
2007, 2007), month = c("August", "August", "September", "September",
"October", "October", "November", "November", "December", "August",
"August", "September", "September", "October", "October", "November",
"November", "December", "August", "August", "September", "October",
"October", "November", "November", "December", "August", "August",
"September", "September", "October"), date = structure(c(12265,
12281, 12296, 12311, 12326, 12342, 12357, 12372, 12387, 12631,
12647, 12662, 12677, 12692, 12708, 12723, 12738, 12753, 12996,
13012, 13027, 13422, 13438, 13453, 13468, 13483, 13726, 13742,
13757, 13772, 13787), class = "Date"), turb = c(12.5, 11.8, 8.9,
6.2, 5.1, 3.4, -0.1, 0, -0.1, 12.5, 10.4, 8.6, 6.1, 4.7, 1.4,
0.1, -0.1, -0.1, 12.5, 11, 8, 4.4, 1.1, 0.5, -0.2, -0.2, 12.5,
11.7, 10, 5.9, 3.6)), row.names = c(NA, -31L), class = c("tbl_df",
"tbl", "data.frame"))
# Swap the Date column for its underlying day count so it can serve as a
# numeric covariate. After this line `m$date` is no longer of class Date.
m[["date"]] <- as.numeric(m[["date"]])

# Model `turb` as a smooth function of the numeric date, then inspect the fit.
mod1 <- gam(turb ~ s(date), data = m)
summary(mod1)
# How can I predict 3 more years (up to 2010) and have those predictions
# appended to the bottom of the original dataset (m)?
Perhaps I'm wrong, but I think you're trying to predict `turb` based on the smoothed `date`. If so, then:
# `%m+%` (month arithmetic) and `months()` for plain numbers come from
# lubridate. Attach it explicitly: library(tidyverse) only attaches lubridate
# from tidyverse 2.0.0 onwards, so older installs would fail on `%m+%`.
library(lubridate)

m$datenum <- as.numeric(m$date) # just to keep `date` unchanged
mod1 <- mgcv::gam(turb ~ s(datenum), data = m)

# Monthly steps from the last observed date (2007-10-01 in this data) up to
# 36 months later.
month_starts <- max(m$date) %m+% months(0:36)

# Pair every monthly step with a second date 15 days later, ordered as "all
# monthly steps first, then all +15-day dates". The first element is the last
# observed date itself, which is already in `m`, so it is dropped.
new_m <- tibble(date = c(month_starts, month_starts + 15)[-1]) %>%
  mutate(datenum = as.numeric(date))
new_m
# # A tibble: 73 × 2
# date datenum
# <date> <dbl>
# 1 2007-11-01 13818
# 2 2007-12-01 13848
# 3 2008-01-01 13879
# 4 2008-02-01 13910
# 5 2008-03-01 13939
# 6 2008-04-01 13970
# 7 2008-05-01 14000
# 8 2008-06-01 14031
# 9 2008-07-01 14061
# 10 2008-08-01 14092
# # ℹ 63 more rows
# # ℹ Use `print(n = ...)` to see more rows
# Predict `turb` for every row of `new_m`. All of these dates lie after the
# last observation, so every value is an extrapolation. The output below falls
# steadily to about -186, far outside the observed range of -0.2 to 12.5, so
# check these forecasts for plausibility before relying on them.
predict(mod1, newdata = new_m)
# 1 2 3 4 5 6 7 8 9 10 11 12 13
# -1.246062 -6.392131 -11.709736 -17.027341 -22.001874 -27.319479 -32.465548 -37.783153 -42.929222 -48.246827 -53.564432 -58.710501 -64.028106
# 14 15 16 17 18 19 20 21 22 23 24 25 26
# -69.174175 -74.491780 -79.809385 -84.612383 -89.929987 -95.076057 -100.393661 -105.539731 -110.857335 -116.174940 -121.321010 -126.638614 -131.784684
# 27 28 29 30 31 32 33 34 35 36 37 38 39
# -137.102288 -142.419893 -147.222891 -152.540496 -157.686565 -163.004170 -168.150239 -173.467844 -178.785449 -183.931518 1.498509 -3.819096 -8.965165
# 40 41 42 43 44 45 46 47 48 49 50 51 52
# -14.282770 -19.600375 -24.574909 -29.892514 -35.038583 -40.356188 -45.502257 -50.819862 -56.137466 -61.283536 -66.601140 -71.747210 -77.064814
# 53 54 55 56 57 58 59 60 61 62 63 64 65
# -82.382419 -87.185417 -92.503022 -97.649091 -102.966696 -108.112765 -113.430370 -118.747975 -123.894044 -129.211649 -134.357718 -139.675323 -144.992928
# 66 67 68 69 70 71 72 73
# -149.795926 -155.113531 -160.259600 -165.577205 -170.723274 -176.040879 -181.358483 -186.504553
This can be combined with the original data with
# Forecast `turb` for the future dates, rebuild `year` and `month` in the same
# form the original data uses, then append the observed rows and sort by date
# so the forecasts end up at the bottom.
combined <- new_m %>%
  mutate(
    # Pass `new_m` directly: `cur_data()` is deprecated as of dplyr 1.1.0, and
    # on this ungrouped data it returned the same data frame anyway.
    turb = predict(mod1, newdata = new_m),
    year = as.integer(format(date, format = "%Y")),
    # "%B" is the full month name in the current locale.
    month = format(date, format = "%B")
  ) %>%
  bind_rows(m) %>%
  arrange(date)
combined
# # A tibble: 104 × 5
# date datenum turb year month
# <date> <dbl> <dbl[1d]> <dbl> <chr>
# 1 2003-08-01 12265 12.5 2003 August
# 2 2003-08-17 12281 11.8 2003 August
# 3 2003-09-01 12296 8.9 2003 September
# 4 2003-09-16 12311 6.2 2003 September
# 5 2003-10-01 12326 5.1 2003 October
# 6 2003-10-17 12342 3.4 2003 October
# 7 2003-11-01 12357 -0.1 2003 November
# 8 2003-11-16 12372 0 2003 November
# 9 2003-12-01 12387 -0.1 2003 December
# 10 2004-08-01 12631 12.5 2004 August
# # ℹ 94 more rows
# # ℹ Use `print(n = ...)` to see more rows
# Show the last six rows: the forecasts that were appended after the
# observed data.
tail(combined)
# # A tibble: 6 × 5
# date datenum turb year month
# <date> <dbl> <dbl[1d]> <dbl> <chr>
# 1 2010-08-01 14822 -173. 2010 August
# 2 2010-08-16 14837 -176. 2010 August
# 3 2010-09-01 14853 -179. 2010 September
# 4 2010-09-16 14868 -181. 2010 September
# 5 2010-10-01 14883 -184. 2010 October
# 6 2010-10-16 14898 -187. 2010 October