I would like to speed up this data.table code; I think it can be done better. The task is a typical one: for each row, look backwards for the last row where a predicate was TRUE and compute the time from that row to the current row, all by group. I am running this on roughly 300,000 rows with 200,000 groups, and I need to calculate many columns (metrics) like this, so speed matters to me.
Here is a smaller example data set:
data <- data.table(
  SYSKEY = c(
    12, 13, 14, 15, 20,
    22, 21, 24, 25, 26
  ),
  Customer = c(
    "John", "John", "John", "Tom", "Tom",
    "Tom", "Sally", "Sally", "Sally", "Sally"
  ),
  TRAN_DATTIM = as.Date(
    c(
      "28-02-2024", "28-02-2024", "02-03-2024", "02-03-2024", "02-03-2024",
      "02-03-2024", "02-03-2024", "02-03-2024", "03-03-2024", "03-03-2024"
    ),
    format = "%d-%m-%Y", origin = "01-01-1900"
  ),
  Product = c(
    "Eggs", "Milk", "Bread", "Butter", "Eggs",
    "Milk", "Bread", "Butter", "Eggs", "Wine"
  )
)
My function and execution code:
library(data.table)
library(rlang)  # enexpr() is used inside build_recency.data.table()
library(dplyr)  # if_else() is used in the example call below

build_recency <- function(
    data,
    name,
    predicate,
    aggregated_fun = "TIME",
    gamma = 0.0001,
    rolling_over
){
  UseMethod("build_recency")
}

build_recency.data.table <- function(
    data,
    name,
    predicate,
    aggregated_fun = "TIME",
    gamma = 0.01,
    rolling_over = "PAN"
){
  predicate <- enexpr(predicate)
  data[,
    PRED := fifelse(eval(predicate) == TRUE, 1L, 0L)
  ]
  setorderv(
    data,
    c(rolling_over, "TRAN_DATTIM", "SYSKEY")
  )
  # left join that trails the last TRUE predicate (PRED) within each
  # rolling_over group
  data[, c(
    .SD[
      # last timestamp with PRED == TRUE
      PRED == 1, .(TRAN_DATTIM, SYSKEY)
    ][
      .SD,
      # make sure a row does not join to itself
      on = .(TRAN_DATTIM <= TRAN_DATTIM, SYSKEY < SYSKEY),
      # in that cartesian product, keep the latest match.
      # DT must be ordered by TRAN_DATTIM!
      mult = "last",
      # j: return all .SD cols plus the new column x.TRAN_DATTIM;
      # the env argument helps here, see
      # https://cran.r-project.org/web/packages/data.table/vignettes/datatable-programming.html#:~:text=Substituting%20lists%20of%20arbitrary%20length
      cols,
      env = list(
        cols = I(c(colnames(.SD), "x.TRAN_DATTIM"))
      )
    ]
  ),
  by = rolling_over
  ][,
    # build the recency and clean up auxiliary vars
    `:=`(
      name = {
        x <- as.numeric(
          difftime(
            TRAN_DATTIM,
            x.TRAN_DATTIM,
            units = "days"
          )
        )
        if (aggregated_fun == "EXP")
          x <- exp(-gamma * x)
        x
      },
      x.TRAN_DATTIM = NULL,
      PRED = NULL
    ),
    env = list(
      gamma = I(gamma),
      name = name
    )
  ]
}
data2 <- build_recency(
  data = data,
  name = "RECENCY",
  predicate = if_else(
    Product == 'Eggs', TRUE, FALSE
  ),
  gamma = 0.001,
  rolling_over = 'Customer'
)
data2[]
Expected result:
Customer SYSKEY TRAN_DATTIM Product RECENCY
<char> <num> <Date> <char> <num>
1: John 12 2024-02-28 Eggs NA
2: John 13 2024-02-28 Milk 0
3: John 14 2024-03-02 Bread 3
4: Sally 21 2024-03-02 Bread NA
5: Sally 24 2024-03-02 Butter NA
6: Sally 25 2024-03-03 Eggs NA
7: Sally 26 2024-03-03 Wine 0
8: Tom 15 2024-03-02 Butter NA
9: Tom 20 2024-03-02 Eggs NA
10: Tom 22 2024-03-02 Milk 0
A speedy data.table-only version would be:
data[ , PREDICATE_Date := fifelse(Product=="Eggs",TRAN_DATTIM,NA)
][ , PREDICATE_Date := nafill(PREDICATE_Date,"locf"), by=Customer
][ , RECENCY := TRAN_DATTIM-PREDICATE_Date
][ , RECENCY := fifelse(Product=="Eggs", NA, RECENCY)
]
Short explanation: PREDICATE_Date is first set to TRAN_DATTIM in the predicate rows and NA everywhere else; nafill(..., "locf") then fills it, per customer, with the most recent such date. RECENCY is the difference in days and is set back to NA in the predicate rows. But collapse and timeplyr are indeed insanely powerful, as @NicChr already explained.
I think the most efficient way is to use collapse inside a data.table, i.e. just copy the approach of @NicChr into a data.table.
Let's do some benchmarking :)
I replicated your toy data set N = 5e4 times, which gives 500,000 rows and 150,000 unique customers:
library(data.table)
library(dplyr)
library(collapse)
library(timeplyr)
N <- 5e4 ## Number of replicates of each customer
Data1 <- data[rep(1:10,N)] ## replicate data
Data1[,Customer:=paste0(Customer,rep(1:N,each=10))] ## make customer names unique
Data2 <- copy(Data1)
Data3 <- copy(Data1)
bench::mark(
dplyr.collapse={
out <- Data1 %>%
# arrange(Customer) %>%
mutate(id = fcumsum(Product == "Eggs", g = Customer)) %>%
mutate(RECENCY = time_elapsed(TRAN_DATTIM, "days", rolling = FALSE,
g = pick(Customer, id))) %>%
mutate(RECENCY = if_else(id == 0, NA, RECENCY)) %>%
mutate(RECENCY = if_else(frowid(pick(Customer, id)) == 1, NA, RECENCY))
},
datatable={
Data2[ , PREDICATE_Date := fifelse(Product=="Eggs",TRAN_DATTIM,NA)
][ , PREDICATE_Date := nafill(PREDICATE_Date,"locf"), by=Customer
][ , RECENCY := TRAN_DATTIM-PREDICATE_Date
][ , RECENCY := fifelse(Product=="Eggs", NA, RECENCY)
]
},
datatable.collapse={
Data3[, id:=fcumsum(Product=="Eggs",g=Customer)
][, RECENCY:=time_elapsed(TRAN_DATTIM, "days", rolling=FALSE,
g=GRP(list(Customer,id)) )
][, RECENCY:=fifelse(id==0L, NA, RECENCY)
][, RECENCY:=fifelse(frowid(GRP(list(Customer, id))) == 1L, NA, RECENCY)]
}, check=FALSE )
# A tibble: 3 × 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list>
1 dplyr.collapse 49.81s 49.81s 0.0201 72.56MB 0 1 0 49.81s <NULL> <Rprofmem>
2 datatable 9.67s 9.67s 0.103 2.34GB 1.45 1 14 9.67s <NULL> <Rprofmem>
3 datatable.collapse 224.11ms 252.68ms 3.96 53.53MB 1.98 2 1 505.36ms <NULL> <Rprofmem>
# ℹ 2 more variables: time <list>, gc <list>
So, there are gigantic differences in speed. The data.table & collapse & timeplyr combination should manage your use case in well under a second :)
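If you want to keep an interface similar to your build_recency(), here is a rough sketch of how the collapse-inside-data.table variant could be wrapped into a helper. The name build_recency_fast, the hard-coded TRAN_DATTIM/SYSKEY columns and the EXP option simply mirror your question; treat it as an untested outline rather than a drop-in replacement:

build_recency_fast <- function(data, name, predicate, aggregated_fun = "TIME",
                               gamma = 0.01, rolling_over = "Customer") {
  predicate <- substitute(predicate)
  setorderv(data, c(rolling_over, "TRAN_DATTIM", "SYSKEY"))
  data[, PRED := as.integer(eval(predicate))]
  # id = number of TRUE predicates seen so far within the group
  data[, id := fcumsum(PRED, g = get(rolling_over))]
  g <- GRP(data[, c(rolling_over, "id"), with = FALSE])
  # days elapsed since the first row of each (group, id) block,
  # i.e. since the last TRUE predicate
  data[, REC := time_elapsed(TRAN_DATTIM, "days", rolling = FALSE, g = g)]
  # NA before the first TRUE predicate and on the predicate row itself
  data[, REC := fifelse(id == 0L | frowid(g) == 1L, NA_real_, REC)]
  if (aggregated_fun == "EXP") data[, REC := exp(-gamma * REC)]
  setnames(data, "REC", name)
  data[, c("PRED", "id") := NULL][]
}

data2 <- build_recency_fast(copy(data), "RECENCY", Product == "Eggs",
                            gamma = 0.001, rolling_over = "Customer")

On the toy data this should reproduce the RECENCY column of your expected output; like your original function it modifies its input by reference, hence the copy() in the call.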
EDIT: The main bottleneck in the dplyr approach is the usage of pick(...). When we replace it with GRP(list(...)), we get speeds much more similar to those of the data.table version.
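For reference, the swap looks roughly like this in the dplyr pipeline (same columns and packages as in the benchmark above; not re-benchmarked here):

out <- Data1 %>%
  mutate(id = fcumsum(Product == "Eggs", g = Customer)) %>%
  mutate(RECENCY = time_elapsed(TRAN_DATTIM, "days", rolling = FALSE,
                                g = GRP(list(Customer, id)))) %>%
  mutate(RECENCY = if_else(id == 0, NA, RECENCY)) %>%
  mutate(RECENCY = if_else(frowid(GRP(list(Customer, id))) == 1, NA, RECENCY))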