r dplyr data.table dtplyr

Create tables by using data.table and a for loop for multiple columns


I need to speed up code using data.table. I am getting stuck on how to reference variables that are being indexed from a vector.

data:

# Example data: three ids with three rows each. `year` is stored as character
# and is missing for two of the rows; `dv1` and `dv2` are the value columns.
df <- as.data.table(
  data.frame(
    id = rep(c(1, 2, 3), each = 3),
    year = c("2014", "2015", "2016", "2015", "2015", "2016", NA, NA, "2016"),
    code = c(1, 2, 2, 1, 2, 3, 3, 4, 5),
    dv1 = seq_len(9L),
    dv2 = seq_len(9L) + 1L
  )
)

dtplyr code:

cols <- c("dv1", "dv2")

# Print, for each column named in `columns`, a tibble holding the per-group
# mean and standard deviation of that column (missing values removed).
#
# data:    data frame / data.table containing an `id` column, the value
#          columns and the grouping column.
# columns: character vector of value-column names; one table is printed each.
# group:   single string naming the grouping column.
#
# Called for its printing side effect.
test <- function(data, columns, group) {
  for (i in seq_along(columns)) {
    # Use the `data` argument (not a global) so the function works on any
    # input; all_of() makes it explicit that the names come from character
    # variables rather than from columns called `columns` or `group`.
    sub1 <- data %>%
      select("id", all_of(columns[i]), all_of(group)) %>%
      group_by(.data[[group]]) %>%
      summarise(
        mean = mean(.data[[columns[i]]], na.rm = TRUE),
        sd = sd(.data[[columns[i]]], na.rm = TRUE)
      ) %>%
      ungroup() %>%
      as_tibble()
    print(sub1)
  }
}

data.table attempt:

# Attempted data.table port of the dplyr version: for each name in `columns`
# it tries to print the per-group mean and sd. This is the non-working attempt
# the question is about.
# NOTE(review): the body reads the global `df`, not the `data` argument.
test <- function(data, columns, group) {
  for(i in seq_along(columns)) {
    sub1 <- df %>% 
      # NOTE(review): inside `.()`, `columns[i]` and `group` presumably evaluate
      # to the character strings themselves, not to the columns they name.
      .[, .(id, columns[i], group)] %>%
      # NOTE(review): `.data` is the dplyr/rlang pronoun; presumably it is not
      # defined inside data.table's `j`/`by`, so this step fails -- confirm.
      .[, .(mean(.data[[columns[i]]], na.rm=T), sd=sd(.data[[columns[i]]], na.rm=T)), by=.data[[group]]] %>%
      as_tibble() 
    print(sub1)
  }
}

# Run the attempt for both value columns, grouped by `year`.
test(data=df, columns=cols, group="year")

This works on a single variable:

# Hard-coded version for one column: keep id/dv1/year, then compute the mean
# (unnamed, so data.table labels it V1) and sd of dv1 within each year.
as_tibble(
  df[, .(id, dv1, year)][
    , .(mean(dv1, na.rm = TRUE), sd = sd(dv1, na.rm = TRUE)),
    by = year
  ]
)

Solution

  • Since this is just an example, I have deliberately kept the loop rather than simplifying it away, so that you can add more complicated logic inside it later.

    library(data.table)
    
    cols <- c("dv1", "dv2")

    # Print, for each column named in `columns`, a data.table with the
    # per-group mean and standard deviation (missing values removed).
    #
    # data:    a data.table; other data.frame-like inputs are converted.
    # columns: character vector of value-column names; one table per name.
    # group:   character vector naming the grouping column(s).
    #
    # Called for its printing side effect.
    test <- function(data, columns, group) {
      if (!is.data.table(data)) {
        data <- as.data.table(data)
      }
      for (i in columns) {
        # get(i) resolves the column whose name is stored in `i`.
        # The mean is left unnamed, so data.table labels it V1.
        # by = c(group) forces `group` to be read as a character vector of
        # column names instead of being matched against a column called "group".
        sub1 <- data[, .(mean(get(i), na.rm = TRUE), sd = sd(get(i), na.rm = TRUE)),
                     by = c(group)]
        print(sub1)
      }
    }
    
    # Print one summary table per column in `cols`, passing "year" as the group.
    test(data=df, columns=cols, group="year")
    
    #   year   V1    sd
    #1: 2014 1.00    NA
    #2: 2015 3.67 1.528
    #3: 2016 6.00 3.000
    #4: <NA> 7.50 0.707
    
    #   year   V1    sd
    #1: 2014 2.00    NA
    #2: 2015 4.67 1.528
    #3: 2016 7.00 3.000
    #4: <NA> 8.50 0.707