rmissing-datalagmoving-average

R - moving average that treats missing years correctly


In R I want to create a new column that is a moving average of the previous (i.e. exclude current year) n years. However some years are missing, and when that is the case, I would like the moving average calculated on fewer years. Currently the code I have ignores the TIME_PERIOD information and instead just calculates based on lag of previous n years.

Hopefully my example code will make clear what it is that I am after

library(dplyr)

# Create example data frame

df <- data.frame(
  NUTS = rep("XX", 12),
  TIME_PERIOD = c(2008, 2009, 2010, 2011, 
                  # note 2012 is missing
                  2013, 2014, 2015, 
                  # note 2016 is missing
                  2017, 2018, 2019, 2020, 2021),
  DATA = c(524288.2008, 1048576.201, 2097152.201, 4194304.201, 16777216.2, 33554432.2, 67108864.2, 268435456.2, 536870912.2, 1073741824, 2147483648, 4294967296),
  INDICATOR_CODE = rep("Dummy", 12),
  wanted_calculation= c(NA, NA, NA, "DATA average of 2008, 2009, 2010",
      "DATA average of 2009, 2010, 2011",
      "2012 is missing so DATA average of 2011, 2013",
      "2012 is missing so DATA average of 2013, 2014",
      "DATA average of 2013, 2014, 2015",
      "2016 is missing so DATA average of 2015, 2017",
      "2016 is missing so DATA average of 2017, 2018",
      "DATA average of 2017, 2018, 2019",
      "DATA average of 2018, 2019, 2020"
  )
)


# Sort the dataframe by INDICATOR_CODE, NUTS, and TIME_PERIOD
df <- df %>% arrange(INDICATOR_CODE, NUTS, TIME_PERIOD)

# Calculate a moving average (of specified years ) excluding current year with dynamic column naming
moving_avg_length <- 3  # compare data with previous chosen number of years
df <- df %>%
  group_by(INDICATOR_CODE, NUTS) %>%
  mutate(my_bugged_moving_avg = sapply(1:n(), function(i) {
    if (i <= moving_avg_length || INDICATOR_CODE[i] != INDICATOR_CODE[i-1] || NUTS[i] != NUTS[i-1]) { # resets the calculation when changing NUTS or Indicators
      return(NA)
    } else {
      return(mean(DATA[(i - moving_avg_length):(i-1)], na.rm = TRUE))
    }
  })) %>%
  ungroup()


wanted_result <- data.frame(
  wanted_moving_avg = c(NA, NA, NA, 1223338.868, 2446677.534, 10485760.2, 25165824.2, 39146837.53, 167772160.2, 402653184.2, 626349397.5, 1252698795)
)

df <- df%>%
  cbind(wanted_result)

rm(wanted_result)



Solution

  • Here is a solution with tidyr::complete and zoo::rollapply.

    library(tidyverse)
    library(zoo)
    #> 
    #> Attaching package: 'zoo'
    #> The following objects are masked from 'package:base':
    #> 
    #>     as.Date, as.Date.numeric
    
    # Create example data frame
    df <- data.frame(
      NUTS = rep("XX", 12),
      TIME_PERIOD = c(2008, 2009, 2010, 2011, 
                      # note 2012 is missing
                      2013, 2014, 2015, 
                      # note 2016 is missing
                      2017, 2018, 2019, 2020, 2021),
      DATA = c(524288.2008, 1048576.201, 2097152.201, 4194304.201, 16777216.2, 33554432.2, 67108864.2, 268435456.2, 
               536870912.2, 1073741824, 2147483648, 4294967296),
      INDICATOR_CODE = rep("Dummy", 12),
      wanted_calculation= c(NA, NA, NA, "DATA average of 2008, 2009, 2010",
                            "DATA average of 2009, 2010, 2011",
                            "2012 is missing so DATA average of 2011, 2013",
                            "2012 is missing so DATA average of 2013, 2014",
                            "DATA average of 2013, 2014, 2015",
                            "2016 is missing so DATA average of 2015, 2017",
                            "2016 is missing so DATA average of 2017, 2018",
                            "DATA average of 2017, 2018, 2019",
                            "DATA average of 2018, 2019, 2020"
      )
    )
    
    
    # Sort the dataframe by INDICATOR_CODE, NUTS, and TIME_PERIOD
    df <- df %>% arrange(INDICATOR_CODE, NUTS, TIME_PERIOD)
    
    # solution with `rollapply` and `lag`
    df |> 
      complete(TIME_PERIOD = seq(min(df$TIME_PERIOD), max(df$TIME_PERIOD))) |> 
      mutate(
        roll_mean = DATA |> 
          rollapply(width = 3, align = "right", FUN = mean, fill = NA, na.rm = TRUE) |> 
          lag()
      ) |> 
      select(-NUTS, -INDICATOR_CODE)
    #> # A tibble: 14 × 4
    #>    TIME_PERIOD        DATA wanted_calculation                          roll_mean
    #>          <dbl>       <dbl> <chr>                                           <dbl>
    #>  1        2008     524288. <NA>                                          NA     
    #>  2        2009    1048576. <NA>                                          NA     
    #>  3        2010    2097152. <NA>                                          NA     
    #>  4        2011    4194304. DATA average of 2008, 2009, 2010               1.22e6
    #>  5        2012         NA  <NA>                                           2.45e6
    #>  6        2013   16777216. DATA average of 2009, 2010, 2011               3.15e6
    #>  7        2014   33554432. 2012 is missing so DATA average of 2011, 2…    1.05e7
    #>  8        2015   67108864. 2012 is missing so DATA average of 2013, 2…    2.52e7
    #>  9        2016         NA  <NA>                                           3.91e7
    #> 10        2017  268435456. DATA average of 2013, 2014, 2015               5.03e7
    #> 11        2018  536870912. 2016 is missing so DATA average of 2015, 2…    1.68e8
    #> 12        2019 1073741824  2016 is missing so DATA average of 2017, 2…    4.03e8
    #> 13        2020 2147483648  DATA average of 2017, 2018, 2019               6.26e8
    #> 14        2021 4294967296  DATA average of 2018, 2019, 2020               1.25e9
    

    Created on 2023-11-28 with reprex v2.0.2