I have a dataset with data about subscriptions. For each subscription there is the subscriber, the type of subscription, the subscription issue date, the subscription start date, and the subscription end date. One subscriber might have multiple subscriptions of the same type at the same time (i.e. the date intervals might be overlapping).
For each subscription I need to calculate the number of subscription days that the specific subscriber had for that specific type of subscription during the 1826 days (approximately 5 years) before the issue date of that specific subscription.
I wrote a functioning, but slow code. I tried speeding it up by using apply/mapply and purrr, but didn't get that to work. Also I wasn't sure how to re-write it as a vectorized function.
How could I speed up the following working code?
library(lubridate)
library(data.table)
# create example data
df <-
data.frame(
SUBSCRIBER = c("A", "A", "A", "A", "A", "A", "B", "B", "B"),
SUBSCRIPTION_TYPE = c("X", "X", "X", "X", "Z", "Z", "X", "X", "X"),
SUBSCRIPTION_ISSUE = c(
"2021-12-31",
"2022-01-02",
"2022-12-21",
"2023-01-01",
"2025-01-01",
"2025-01-03",
"2023-01-01",
"2025-01-01",
"2025-01-03"
),
SUBSCRIPTION_START = c(
"2022-01-01",
"2022-01-03",
"2023-01-01",
"2023-01-03",
"2025-01-01",
"2025-01-03",
"2023-01-03",
"2025-01-01",
"2025-01-03"
),
SUBSCRIPTION_END = c(
"2022-01-05",
"2022-01-07",
"2023-01-05",
"2023-01-07",
"2025-01-05",
"2025-01-07",
"2023-01-07",
"2025-01-05",
"2025-01-07"
)
)
# convert date columns to Date format
df$SUBSCRIPTION_ISSUE <- as.Date(df$SUBSCRIPTION_ISSUE)
df$SUBSCRIPTION_START <- as.Date(df$SUBSCRIPTION_START)
df$SUBSCRIPTION_END <- as.Date(df$SUBSCRIPTION_END)
# Convert data frame to data table
dt <- as.data.table(df)
# create a function to calculate the cumulative day count for each subscriber, subscription type, and issue date
calc_cumulative_days <-
function(dtinput,
this_SUBSCRIBER,
this_SUBSCRIPTION_TYPE,
this_SUBSCRIPTION_ISSUE) {
# filter the rows within the sliding window
dtsubset <-
dtinput[SUBSCRIBER == this_SUBSCRIBER &
SUBSCRIPTION_TYPE == this_SUBSCRIPTION_TYPE &
SUBSCRIPTION_START < this_SUBSCRIPTION_ISSUE &
SUBSCRIPTION_END > this_SUBSCRIPTION_ISSUE - 1826, ]
dtsubset$days <- 1
# create a data table with all dates within the sliding window
dates <-
data.table(date = seq(
from = this_SUBSCRIPTION_ISSUE[1] - 1,
length.out = 1826,
by = "-1 day"
))
# convert date variable to class Date
dates[, date := as.Date(date)] # Not sure whether this row speeds up or slows down
setkey(dates, date) # Not sure whether this row speeds up or slows down
# join with the dates table to get all dates within the sliding window
joined <-
dates[dtsubset, on = .(date >= SUBSCRIPTION_START, date <= SUBSCRIPTION_END)]
result <- nrow(joined)
return(result)
}
# apply the function to each row in the data table
dt[, historic_subscription_days := calc_cumulative_days(dt, SUBSCRIBER, SUBSCRIPTION_TYPE, SUBSCRIPTION_ISSUE), by = seq_len(nrow(dt))]
With outer
:
dt[
, historic_subscription_days := {
m <- outer(SUBSCRIPTION_END, SUBSCRIPTION_ISSUE - 1, pmin) -
outer(SUBSCRIPTION_START, SUBSCRIPTION_ISSUE - 1826, pmax) + 1
colSums(m*(m > 0))
}, SUBSCRIBER:SUBSCRIPTION_TYPE
]
dt
#> SUBSCRIBER SUBSCRIPTION_TYPE SUBSCRIPTION_ISSUE SUBSCRIPTION_START SUBSCRIPTION_END historic_subscription_days
#> 1: A X 2021-12-31 2022-01-01 2022-01-05 0
#> 2: A X 2022-01-02 2022-01-03 2022-01-07 1
#> 3: A X 2022-12-21 2023-01-01 2023-01-05 10
#> 4: A X 2023-01-01 2023-01-03 2023-01-07 10
#> 5: A Z 2025-01-01 2025-01-01 2025-01-05 0
#> 6: A Z 2025-01-03 2025-01-03 2025-01-07 2
#> 7: B X 2023-01-01 2023-01-03 2023-01-07 0
#> 8: B X 2025-01-01 2025-01-01 2025-01-05 5
#> 9: B X 2025-01-03 2025-01-03 2025-01-07 7
Benchmarking, the outer
solution is about 20 times faster on this dataset. As the dataset gets larger, the difference will become even more pronounced.
microbenchmark::microbenchmark(
historic_subscription_days = dt[, historic_subscription_days := calc_cumulative_days(dt, SUBSCRIBER, SUBSCRIPTION_TYPE, SUBSCRIPTION_ISSUE), by = seq_len(nrow(dt))],
outer = dt[
, historic_subscription_days := {
m <- outer(SUBSCRIPTION_END, SUBSCRIPTION_ISSUE - 1, pmin) -
outer(SUBSCRIPTION_START, SUBSCRIPTION_ISSUE - 1826, pmax) + 1
colSums(m*(m > 0))
}, SUBSCRIBER:SUBSCRIPTION_TYPE
],
check = "equal"
)
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> historic_subscription_days 17428.601 18243.751 20105.650 19611.80 21045.751 29570.101 100
#> outer 888.302 949.301 1094.788 1011.65 1101.802 3360.101 100