As a follow-up to my previous question, I'm interested in improving the performance of the existing recursive sampling function.
By recursive sampling I mean randomly choosing up to n unique unexposed IDs for a given exposed ID, and then randomly choosing up to n unique unexposed IDs from the remaining unexposed IDs for another exposed ID. If there are no remaining unexposed IDs for a given exposed ID, then the exposed ID is left out.
The original function is as follows:
recursive_sample <- function(data, n) {
groups <- unique(data[["exposed"]])
out <- data.frame(exposed = character(), unexposed = character())
for (group in groups) {
chosen <- data %>%
filter(exposed == group,
!unexposed %in% out$unexposed) %>%
group_by(unexposed) %>%
slice(1) %>%
ungroup() %>%
sample_n(size = min(n, nrow(.)))
out <- rbind(out, chosen)
}
out
}
I was able to create a more efficient one as follows:
recursive_sample2 <- function(data, n) {
groups <- unique(data[["exposed"]])
out <- tibble(exposed = integer(), unexposed = integer())
for (group in groups) {
chosen <- data %>%
filter(exposed == group,
!unexposed %in% out$unexposed) %>%
filter(!duplicated(unexposed)) %>%
sample_n(size = min(n, nrow(.)))
out <- bind_rows(out, chosen)
}
out
}
Sample data and bechmarking:
set.seed(123)
df <- tibble(exposed = rep(1:100, each = 100),
unexposed = sample(1:7000, 10000, replace = TRUE))
microbenchmark(f1 = recursive_sample(df, 5),
f2 = recursive_sample2(df, 5),
times = 10)
Unit: milliseconds
expr min lq mean median uq max neval cld
f1 1307.7198 1316.5276 1379.0533 1371.3952 1416.6360 1540.955 10 b
f2 839.0086 865.2547 914.8327 901.2288 970.9518 1036.170 10 a
However, for my actual dataset, I would need an even more efficient (i.e., quicker) function. Any ideas for a more efficient version, whether in data.table
, involving parallelisation or other approaches are welcome.
December 2023 edit
The versions from @minem and @ThomasIsCoding are substantially faster; however, they return only partially correct results. Considering sample data such as:
df <- structure(list(exposed = c(4L, 4L, 4L, 1L, 1L, 1L, 3L, 2L, 2L,
2L, 2L, 2L), unexposed = c(1L, 2L, 2L, 1L, 2L, 3L, 10L, 4L, 5L,
7L, 8L, 9L), rowid = 1:12), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"))
exposed unexposed rowid
1 4 1 1
2 4 2 2
3 4 2 3
4 1 1 4
5 1 2 5
6 1 3 6
7 3 10 7
8 2 4 8
9 2 5 9
10 2 7 10
11 2 8 11
12 2 9 12
I would expect exposed == 4 to be sampled twice, exposed == 1 to be sampled once, exposed == 3 to be sampled once, and exposed == 2 to be sampled twice. In other words, the sampling procedure should reflect the provided order of data. Desired output:
exposed unexposed rowid
1 4 1 1
2 4 2 2
6 1 3 6
7 3 10 7
8 2 4 8
11 2 8 11
A data.table
solution that keeps a running list of sampled values and uses %!in%
from collapse
to prevent them from being sampled again:
library(data.table)
library(collapse) # for %!in%
fjblood1 <- function(data, n) {
setDT(data)
# randomly select unique exposed/unexposed rows
u <- 1:nrow(data)
if (anyDuplicated(data, by = c("exposed", "unexposed"))) {
u <- u[
data[,i := .I][
sample(.N), -i[duplicated(.SD)], .SDcols = c("exposed", "unexposed")
]
]
data[,i := NULL]
}
sampled <- vector(mode(data$unexposed), length(u))
k <- 0L
data[
u, .(
unexposed = {
x <- unexposed[unexposed %!in% sampled[seq_len(k)]]
x <- x[sample.int(length(x), min(length(x), n))]
sampled[seq.int(k + 1L, along.with = x)] <- x
k <- k + length(x)
x
}
), exposed
]
}
Alternatively, if df
has additional columns that you want to preserve:
fjblood2 <- function(data, n) {
setDT(data)
u <- 1:nrow(data)
if (anyDuplicated(data, by = c("exposed", "unexposed"))) {
u <- u[
data[,i := .I][
sample(.N), -i[duplicated(.SD)], .SDcols = c("exposed", "unexposed")
]
]
data[,i := NULL]
}
sampled <- vector(mode(data$unexposed), length(u))
k <- 0L
data[
u, {
i <- which(unexposed %!in% sampled[seq_len(k)])
i <- i[sample.int(length(i), min(length(i), n))]
sampled[seq.int(k + 1L, along.with = i)] <- unexposed[i]
k <- k + length(i)
.SD[i]
}, exposed
]
}
Other candidate functions:
ftmfmnk <- function(data, n) {
groups <- unique(data[["exposed"]])
out <- tibble(exposed = integer(), unexposed = integer())
for (group in groups) {
chosen <- data %>%
filter(
exposed == group,
!unexposed %in% out$unexposed
) %>%
filter(!duplicated(unexposed)) %>%
sample_n(size = min(n, nrow(.)))
out <- bind_rows(out, chosen)
}
out
}
fminem <- function(data, n) {
groups <- unique(data[["exposed"]])
# working on vectors is faster
id <- 1:nrow(data)
i <- vector("integer")
unexposed2 <- vector(class(data$unexposed))
ex <- data$exposed
ux <- data$unexposed
for (group in groups) {
f1 <- ex == group # first filter
f2 <- !ux[f1] %in% unexposed2 # 2nd filter (only on those that match 1st)
id3 <- id[f1][f2][!duplicated(ux[f1][f2])] # check duplicates only on needed
# and select necesary row ids
is <- sample(id3, size = min(length(id3), n)) # sample row ids
i <- c(i, is) # add to list
unexposed2 <- ux[i] # resave unexposed2
}
out <- data[i, ] # only one data.frame subset
out$id <- NULL
out
}
ftic <- function(df, n) {
lst <- split(df, ~exposed)[as.character(unique(df$exposed))]
pool <- c()
for (k in seq_along(lst)) {
d <- lst[[k]]
dd <- subset(d[sample.int(nrow(d)), ], !duplicated(unexposed) & !unexposed %in% pool)
lst[[k]] <- head(dd, n)
pool <- c(pool, dd$unexposed)
}
`rownames<-`(do.call(rbind, lst), NULL)
}
`%nin%` <- function (x, table) {
match(x, table, nomatch = 0L) == 0L
}
fmnist_fast <- function(dat, n) {
# make a named vector
vec <- setNames(dat$unexposed, dat$exposed)
# Randomly shuffle the groups
ord <- unique(names(vec))
# random order the vector
vec <- sample(vec, size = length(vec))
# keep the names for sampling and later
nms <- names(vec)
# note that there is no need to sample again since both the order
# of the groups as well as the order within the groups (vec) was sampled.
# by setting an initial value, we avoid an exception in the first iteration
init <- head(unique(vec[nms == ord[1]]), n)
# `unique()` drops the names
# yet names are needed to retain the initial data structure
names(init) <- rep(ord[1], length(init))
res <- Reduce(
\(x, y) {
# current subgroup
vec_sub <- vec[nms == y]
# without unique, duplicated values inside of the group
# can be selected multiple times
res_vec <- head(unique(vec_sub[vec_sub %nin% x]), n)
# in case all values of the current group are already sampled
len_vec <- length(res_vec)
if (len_vec == 0) {
return(x)
}
# `unique()` drops the names
# yet names are needed to retain the initial data structure
names(res_vec) <- rep(y, len_vec)
# concate
return(c(x, res_vec))
},
# first group is already in init
ord[-1],
# first group, randomly sampled (see above)
init = init
)
# back to a data.frame
res_dat <- data.frame(exposed = names(res),
unexposed = res)
return(res_dat)
}
Compare the results from the dataset in the question (with two additional columns to demonstrate fjblood2
):
df <- data.frame(exposed = c(4L, 4L, 4L, 1L, 1L, 1L, 3L, 2L, 2L, 2L, 2L, 2L),
unexposed = c(1L, 2L, 2L, 1L, 2L, 3L, 10L, 4L, 5L, 7L, 8L, 9L),
rowid = 1:12)
dt <- as.data.table(df)
fjblood1(dt, 2L)
#> exposed unexposed
#> 1: 4 1
#> 2: 4 2
#> 3: 1 3
#> 4: 3 10
#> 5: 2 4
#> 6: 2 7
fjblood2(dt, 2L)
#> exposed unexposed rowid
#> 1: 4 2 3
#> 2: 4 1 1
#> 3: 1 3 6
#> 4: 3 10 7
#> 5: 2 8 11
#> 6: 2 9 12
ftmfmnk(df, 2L)
#> # A tibble: 6 × 3
#> exposed unexposed rowid
#> <int> <int> <int>
#> 1 4 1 1
#> 2 4 2 2
#> 3 1 3 6
#> 4 3 10 7
#> 5 2 4 8
#> 6 2 9 12
fminem(df, 2L)
#> exposed unexposed rowid
#> 1 4 1 1
#> 2 4 2 2
#> 3 4 2 3
#> 5 1 2 5
#> 10 2 7 10
#> 9 2 5 9
fmnist_fast(df, 2L)
#> exposed unexposed
#> 1 4 2
#> 2 4 1
#> 3 1 3
#> 4 3 10
#> 5 2 5
#> 6 2 8
ftic(df, 2L)
#> exposed unexposed rowid
#> 1 4 2 3
#> 2 4 1 1
#> 3 1 3 6
#> 4 3 10 7
#> 5 2 7 10
#> 6 2 5 9
As pointed out in the updated question, fminem
does not behave as desired.
Benchmarking with a larger dataset:
set.seed(123)
df <- tibble(
exposed = rep(1:1000, each = 1000),
unexposed = sample(7e4, 1e6, 1),
data1 = runif(1e6),
data2 = sample(LETTERS, 1e6, 1)
)
dt <- as.data.table(df)
microbenchmark::microbenchmark(
ftmfmnk = ftmfmnk(df, 100L),
fminem = fminem(df, 100L),
ftic = ftic(df, 100L),
fmnist_fast(df, 100L),
fjblood1 = fjblood1(dt, 100L),
fjblood2 = fjblood2(dt, 100L),
times = 1
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> ftmfmnk 74362.9330 74362.9330 74362.9330 74362.9330 74362.9330 74362.9330 1
#> fminem 5675.6071 5675.6071 5675.6071 5675.6071 5675.6071 5675.6071 1
#> ftic 4624.8753 4624.8753 4624.8753 4624.8753 4624.8753 4624.8753 1
#> fmnist_fast(df, 100L) 16919.1535 16919.1535 16919.1535 16919.1535 16919.1535 16919.1535 1
#> fjblood1 943.9751 943.9751 943.9751 943.9751 943.9751 943.9751 1
#> fjblood2 909.6813 909.6813 909.6813 909.6813 909.6813 909.6813 1
Another benchmark with more replications and excluding ftmfmnk
:
microbenchmark::microbenchmark(
# ftmfmnk = ftmfmnk(df, 100L),
fminem = fminem(df, 100L),
ftic = ftic(df, 100L),
fmnist_fast(df, 100L),
fjblood1 = fjblood1(dt, 100L),
fjblood2 = fjblood2(dt, 100L),
times = 10
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> fminem 5518.5713 5550.8040 5614.1822 5600.498 5615.9687 5788.3764 10
#> ftic 4370.4639 4412.8366 4467.5108 4434.879 4558.6254 4607.8332 10
#> fmnist_fast(df, 100L) 16483.5461 16524.7059 16639.4094 16602.269 16708.1953 17001.8668 10
#> fjblood1 618.8821 628.6334 644.4724 636.097 646.6322 693.7889 10
#> fjblood2 707.1110 730.6251 756.7017 754.079 788.2097 814.0123 10