rlag

imputing NA with combination of previous values without loop


continuing on last question, i need to impute missing last couple of years as combination of previous years in R. In the data below, i need to impute 2020 as linear combination of 2019,2018 and 2017, and thene 2021 as combination of 2020, 2019, 2018.

> comb1 <- cbind(CJ(letters[1:4], 2000:2019), rnorm(80,2,1)) 
> comb2 <-  cbind(CJ(letters[1:4], 2020:2024), data.frame(rep(NA, 20)))
> colnames(comb1) <- c("state","year","v")
> colnames(comb2) <- c("state","year","v")
> 
> comb <- rbind(comb1, comb2)
> comb <- comb[order(comb$state, comb$year),]
>

I can do it in a loop (below) but it it very slow as my data is large (i have many "states"). Does someone know how to do it in more efficient way?

> library(dplyr)
> vlist <- unique(comb$state)
> for (i in vlist) {
> for (j in (2002:2024)) {
> value <- 0.65*comb$v[comb$year==j-1 & comb$stat==i] + 0.25*comb$v[comb$year==j-2 & comb$state==i] + 0.10*comb$v[comb$year==j-3 & comb$state==i]
> if (is.na(comb$v[comb$year==j & comb$state==i])) {
> comb$v[comb$year==j & comb$state==i] <- value
>     }
>   }
> }
> 

I managed to do it partially by vectors so is faster, but more elegant solution would be nice though...

for (j in (2002:2024)) {
  
  comb$v_1 <- ave(comb$v, comb$state, FUN = dplyr::lag)
  comb$v_2 <- ave(comb$v_1, comb$state, FUN = dplyr::lag) 
  comb$v_3 <- ave(comb$v_2, comb$state, FUN = dplyr::lag) 
  
  comb$imp <- 0.65*comb$v_1 + 0.25*comb$v_2 + 0.10*comb$v_3
  comb$v <- ifelse(is.na(comb$v)&comb$year==j, comb$imp, comb$v)
  
 comb$v_1 <- NULL
 comb$v_2 <- NULL
 comb$v_3 <- NULL
 comb$imp <- NULL
}

Solution

  • A bit convoluted, but does the trick:

    First, create a simple function to calculate the value to be imputed, from the last three numbers on a vector:

    impute <- function(x) {
      sum(tail(x, 3) * c(0.1, 0.25, 0.65))
    }
    

    Then another simple function that calls this function whenever an NA is found (or just moves forward along the vector):

    roll_impute <- function(x, y) {
      if (is.na(x)) {
        c(x, impute(x))
      } else {
        c(x, y)
      }
    }
    

    Now just use reduce() from the purrr package to recursively apply this function to v. Since you want this to work separately per state, we group the data.frame beforehand:

    comb %>%
      group_by(state) %>%
      mutate(v = reduce(v, roll_impute))
    

    Edit:

    The first two steps can obviously be simplified into:

    roll_impute <- function(x, y) {
      if (is.na(x)) {
        c(x, sum(tail(x, 3) * c(0.1, 0.25, 0.65)))
      } else {
        c(x, y)
      }
    }
    

    Also, running microbenchmark on a larger set (26 states) shows the speedup is about 14-fold, on average:

    library(microbenchmark)
    
    method_a <- function(comb) {
      vlist <- unique(comb$state)
        for (i in vlist) {
          for (j in (2002:2024)) {
            value <- 0.65*comb$v[comb$year==j-1 & comb$stat==i] + 0.25*comb$v[comb$year==j-2 & comb$state==i] + 0.10*comb$v[comb$year==j-3 & comb$state==i]
            if (is.na(comb$v[comb$year==j & comb$state==i])) {
              comb$v[comb$year==j & comb$state==i] <- value
            }
          }
        }
      comb
    }
    
    method_b <- function(comb) {
        comb %>%
        group_by(state) %>%
        mutate(v = reduce(v, roll_impute))
    }
    
    comb <-
      bind_rows(
        expand.grid(state = letters, year = 2000:2019, stringsAsFactors = F) %>%
        mutate(v = rnorm(nrow(.))),
        expand.grid(state = letters, year = 2020:2024, stringsAsFactors = F) %>%
          mutate(v = NA)
      )
    
    microbenchmark(method_a(comb), method_b(comb))
    
    Unit: milliseconds
               expr       min        lq       mean    median         uq        max neval cld
     method_a(comb) 60.616898 61.250113 110.456925 61.890937 177.375896 317.968330   100  a
     method_b(comb)  7.357221  7.473352   7.943528  8.070393   8.240969   9.690326   100   b