In R I want to create a new column that is a moving average of the previous (i.e. exclude current year) n years. However some years are missing, and when that is the case, I would like the moving average calculated on fewer years. Currently the code I have ignores the TIME_PERIOD information and instead just calculates based on lag of previous n years.
Hopefully my example code will make clear what it is that I am after
library(dplyr)
# Create example data frame
df <- data.frame(
NUTS = rep("XX", 12),
TIME_PERIOD = c(2008, 2009, 2010, 2011,
# note 2012 is missing
2013, 2014, 2015,
# note 2016 is missing
2017, 2018, 2019, 2020, 2021),
DATA = c(524288.2008, 1048576.201, 2097152.201, 4194304.201, 16777216.2, 33554432.2, 67108864.2, 268435456.2, 536870912.2, 1073741824, 2147483648, 4294967296),
INDICATOR_CODE = rep("Dummy", 12),
wanted_calculation= c(NA, NA, NA, "DATA average of 2008, 2009, 2010",
"DATA average of 2009, 2010, 2011",
"2012 is missing so DATA average of 2011, 2013",
"2012 is missing so DATA average of 2013, 2014",
"DATA average of 2013, 2014, 2015",
"2016 is missing so DATA average of 2015, 2017",
"2016 is missing so DATA average of 2017, 2018",
"DATA average of 2017, 2018, 2019",
"DATA average of 2018, 2019, 2020"
)
)
# Sort the dataframe by INDICATOR_CODE, NUTS, and TIME_PERIOD
df <- df %>% arrange(INDICATOR_CODE, NUTS, TIME_PERIOD)
# Calculate a moving average (of specified years ) excluding current year with dynamic column naming
moving_avg_length <- 3 # compare data with previous chosen number of years
df <- df %>%
group_by(INDICATOR_CODE, NUTS) %>%
mutate(my_bugged_moving_avg = sapply(1:n(), function(i) {
if (i <= moving_avg_length || INDICATOR_CODE[i] != INDICATOR_CODE[i-1] || NUTS[i] != NUTS[i-1]) { # resets the calculation when changing NUTS or Indicators
return(NA)
} else {
return(mean(DATA[(i - moving_avg_length):(i-1)], na.rm = TRUE))
}
})) %>%
ungroup()
wanted_result <- data.frame(
wanted_moving_avg = c(NA, NA, NA, 1223338.868, 2446677.534, 10485760.2, 25165824.2, 39146837.53, 167772160.2, 402653184.2, 626349397.5, 1252698795)
)
df <- df%>%
cbind(wanted_result)
rm(wanted_result)
Here is a solution with tidyr::complete
and zoo::rollapply
.
library(tidyverse)
library(zoo)
#>
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#>
#> as.Date, as.Date.numeric
# Create example data frame
df <- data.frame(
NUTS = rep("XX", 12),
TIME_PERIOD = c(2008, 2009, 2010, 2011,
# note 2012 is missing
2013, 2014, 2015,
# note 2016 is missing
2017, 2018, 2019, 2020, 2021),
DATA = c(524288.2008, 1048576.201, 2097152.201, 4194304.201, 16777216.2, 33554432.2, 67108864.2, 268435456.2,
536870912.2, 1073741824, 2147483648, 4294967296),
INDICATOR_CODE = rep("Dummy", 12),
wanted_calculation= c(NA, NA, NA, "DATA average of 2008, 2009, 2010",
"DATA average of 2009, 2010, 2011",
"2012 is missing so DATA average of 2011, 2013",
"2012 is missing so DATA average of 2013, 2014",
"DATA average of 2013, 2014, 2015",
"2016 is missing so DATA average of 2015, 2017",
"2016 is missing so DATA average of 2017, 2018",
"DATA average of 2017, 2018, 2019",
"DATA average of 2018, 2019, 2020"
)
)
# Sort the dataframe by INDICATOR_CODE, NUTS, and TIME_PERIOD
df <- df %>% arrange(INDICATOR_CODE, NUTS, TIME_PERIOD)
# solution with `rollapply` and `lag`
df |>
complete(TIME_PERIOD = seq(min(df$TIME_PERIOD), max(df$TIME_PERIOD))) |>
mutate(
roll_mean = DATA |>
rollapply(width = 3, align = "right", FUN = mean, fill = NA, na.rm = TRUE) |>
lag()
) |>
select(-NUTS, -INDICATOR_CODE)
#> # A tibble: 14 × 4
#> TIME_PERIOD DATA wanted_calculation roll_mean
#> <dbl> <dbl> <chr> <dbl>
#> 1 2008 524288. <NA> NA
#> 2 2009 1048576. <NA> NA
#> 3 2010 2097152. <NA> NA
#> 4 2011 4194304. DATA average of 2008, 2009, 2010 1.22e6
#> 5 2012 NA <NA> 2.45e6
#> 6 2013 16777216. DATA average of 2009, 2010, 2011 3.15e6
#> 7 2014 33554432. 2012 is missing so DATA average of 2011, 2… 1.05e7
#> 8 2015 67108864. 2012 is missing so DATA average of 2013, 2… 2.52e7
#> 9 2016 NA <NA> 3.91e7
#> 10 2017 268435456. DATA average of 2013, 2014, 2015 5.03e7
#> 11 2018 536870912. 2016 is missing so DATA average of 2015, 2… 1.68e8
#> 12 2019 1073741824 2016 is missing so DATA average of 2017, 2… 4.03e8
#> 13 2020 2147483648 DATA average of 2017, 2018, 2019 6.26e8
#> 14 2021 4294967296 DATA average of 2018, 2019, 2020 1.25e9
Created on 2023-11-28 with reprex v2.0.2