rdplyrdata.table

data.table vs dplyr: apply function returning changing column names over groups


I want to apply a function (ratefunc()) to a grouped data frame which returns changing column names dependent on the result:

library(data.table)
library(dplyr)

dt <- data.table:::data.table(
  group=c(rep("A", 3), rep("B", 3)), 
  x=c(1:3, 6:8), 
  y=c(4:6, 9:11)
)

ratefunc <- function(data, x_col="x", y_col="y") {
  res <- sum(data[[x_col]] + data[[y_col]])
  if (res < 25) {
    return(
      data.frame(a=rep("a", nrow(data)))
    )
  } else {
    return(
      data.frame(b=rep("b", nrow(data)))
    )
  }
}

dplyr returns the desired result by

dt %>% 
  group_by(group) %>% 
  group_modify(
    ~ratefunc(data=.)
  )

Console output:

# A tibble: 6 × 3
# Groups:   group [2]
  group a     b    
  <chr> <chr> <chr>
1 A     a     NA   
2 A     a     NA   
3 A     a     NA   
4 B     NA    b    
5 B     NA    b    
6 B     NA    b    

data.table instead coerces the results into one column ignoring the differently named columns:

dt[
  ,
  ratefunc(
    data=.SD
  )
  ,
  by=group
]

Console output:

    group      a
   <char> <char>
1:      A      a
2:      A      a
3:      A      a
4:      B      b
5:      B      b
6:      B      b

How can I get the identical result as in dplyr when using data.table?

And how is the approach of selecting the data frame columns by column name in ratefunc() (i.e. x_col and y_col being string inputs) to be evaluated in contrast to selecting them directly (i.e. x_col and y_col as vector/column inputs)?


Solution

  • Your ratefunc is not something I would create. I would do something like this. This could be wrapped in a function if necessary.

    dt[, tmp := fifelse(sum(Reduce(`+`, .SD)) < 25, "a", "b"), 
       by = group, .SDcols = c("x", "y")]
    dt[, ind := .I]
    dcast(dt, group + ind ~ tmp, value.var = "tmp")
    #Key: <group, ind>
    #    group   ind      a      b
    #   <char> <int> <char> <char>
    #1:      A     1      a   <NA>
    #2:      A     2      a   <NA>
    #3:      A     3      a   <NA>
    #4:      B     4   <NA>      b
    #5:      B     5   <NA>      b
    #6:      B     6   <NA>      b
    

    Edit adressing additional requirements from comments:

    dt[, colnames := fifelse(sum(Reduce(`+`, .SD)) < 25, "col1", "col2"), 
       by = group, .SDcols = c("x", "y")]
    
    dt[data.table(colnames = c("col1", "col2"),
                  colvalues = c("a", "b")), 
       colvalues := i.colvalues, on = .(colnames)]
    
    dt[, ind := .I]
    dcast(dt, group + ind ~ colnames, value.var = "colvalues")
    # Key: <group, ind>
    #     group   ind   col1   col2
    #    <char> <int> <char> <char>
    # 1:      A     1      a   <NA>
    # 2:      A     2      a   <NA>
    # 3:      A     3      a   <NA>
    # 4:      B     4   <NA>      b
    # 5:      B     5   <NA>      b
    # 6:      B     6   <NA>      b