i have some data
set.seed(1)
n <- 100
df <- data.frame(
x = sample(1:30, n, replace = T),
y = sample(1:30, n, replace = T),
z = sample(1:30, n, replace = T)
)
And also i have list of functions, the number of functions can be any, the complexity of the functions too (I intentionally made them simple for this example)
rules <- list(function(i) df$x[i]==26,
function(i) df$y[i]==17,
function(i) df$z[i]==14)
Next, I have a function that sequentially searches for the triggering of the first function, then the second, and so on
# search sequence
# first x[i]==26 then y[i]==17 then z[i]==14
find_rules <- function(df, rules){
ln <- length(rules)
n <- 1
res <- matrix(0,nrow = ln, ncol = 2, dimnames = list(NULL, c("row","res")))
for(i in 1:nrow(df)){
if(rules[[n]](i)){
res[n,"row"] <- i
res[n,"res"] <- 1
n <- n+1
}
if(n==ln+1) break
}
return(res)
}
I would like to speed up find_rules
function as much as possible without changing anything in the code presented above.
I would also like complete identity in the calculations of your solution and find_rules
on different seeds()
I tried to keep your code as much as possible but we can change the way of iterating, i.e., by rules
find_rules_TIC <- function(df, rules) {
res <- matrix(0, nrow = length(rules), ncol = 2, dimnames = list(NULL, c("row", "res")))
rid <- seq_len(nrow(df))
for (i in seq_along(rules)) {
k <- which(rules[[i]](rid))
res[i, "row"] <- k[k >= res[ifelse(i == 1, 1, i - 1), "row"]][1]
res[i, "res"] <- 1
}
res
}
which should be faster than your original solution if the number of rules is less than the number of df
rows.
find_rules_mrT <- function(df, rules) {
ln <- length(rules)
n <- 1
res <- matrix(0, nrow = ln, ncol = 2, dimnames = list(NULL, c("row", "res")))
for (i in 1:nrow(df)) {
if (rules[[n]](i)) {
res[n, "row"] <- i
res[n, "res"] <- 1
n <- n + 1
}
if (n == ln + 1) break
}
return(res)
}
find_rules_TIC <- function(df, rules) {
res <- matrix(0, nrow = length(rules), ncol = 2, dimnames = list(NULL, c("row", "res")))
rid <- seq_len(nrow(df))
for (i in seq_along(rules)) {
k <- which(rules[[i]](rid))
res[i, "row"] <- k[k >= res[ifelse(i == 1, 1, i - 1), "row"]][1]
res[i, "res"] <- 1
}
res
}
find_rules_TIC2 <- function(df, rules) {
rid <- seq_len(nrow(df))
cbind(
row = Reduce(\(a, b) b[b >= a][1],
lapply(rules, \(f) which(f(rid))),
init = -Inf,
accumulate = TRUE
)[-1],
res = 1
)
}
microbenchmark(
find_rules_mrT(df, rules),
find_rules_TIC(df, rules),
find_rules_TIC2(df, rules),
check = "equal",
unit = "relative"
)
and we see that
Unit: relative
expr min lq mean median uq
find_rules_mrT(df, rules) 3.825137 3.710938 3.420265 3.628788 3.462264
find_rules_TIC(df, rules) 1.000000 1.000000 1.000000 1.000000 1.000000
find_rules_TIC2(df, rules) 1.098361 1.093750 3.288799 1.093434 1.127358
max neval
2.89117 100
1.00000 100
99.19507 100