rperformance

how to quickly iterate functions one by one


i have some data

set.seed(1)
n <- 100
df <- data.frame(
  x = sample(1:30, n, replace = T),
  y = sample(1:30, n, replace = T),
  z = sample(1:30, n, replace = T)
)

And also i have list of functions, the number of functions can be any, the complexity of the functions too (I intentionally made them simple for this example)

rules <- list(function(i) df$x[i]==26,
              function(i) df$y[i]==17,
              function(i) df$z[i]==14)

Next, I have a function that sequentially searches for the triggering of the first function, then the second, and so on

# search sequence
# first x[i]==26 then y[i]==17 then z[i]==14
find_rules <- function(df, rules){
ln <- length(rules)
n <- 1
res <- matrix(0,nrow = ln, ncol = 2, dimnames = list(NULL, c("row","res")))
for(i in 1:nrow(df)){
  if(rules[[n]](i)){
    res[n,"row"] <- i
    res[n,"res"] <- 1
    n <- n+1
  }
  if(n==ln+1) break
}
return(res)
}

I would like to speed up find_rules function as much as possible without changing anything in the code presented above. I would also like complete identity in the calculations of your solution and find_rules on different seeds()


Solution

  • I tried to keep your code as much as possible but we can change the way of iterating, i.e., by rules

    find_rules_TIC <- function(df, rules) {
        res <- matrix(0, nrow = length(rules), ncol = 2, dimnames = list(NULL, c("row", "res")))
        rid <- seq_len(nrow(df))
        for (i in seq_along(rules)) {
            k <- which(rules[[i]](rid))
            res[i, "row"] <- k[k >= res[ifelse(i == 1, 1, i - 1), "row"]][1]
            res[i, "res"] <- 1
        }
        res
    }
    

    which should be faster than your original solution if the number of rules is less than the number of df rows.

    benchmark

    find_rules_mrT <- function(df, rules) {
        ln <- length(rules)
        n <- 1
        res <- matrix(0, nrow = ln, ncol = 2, dimnames = list(NULL, c("row", "res")))
        for (i in 1:nrow(df)) {
            if (rules[[n]](i)) {
                res[n, "row"] <- i
                res[n, "res"] <- 1
                n <- n + 1
            }
            if (n == ln + 1) break
        }
        return(res)
    }
    
    find_rules_TIC <- function(df, rules) {
        res <- matrix(0, nrow = length(rules), ncol = 2, dimnames = list(NULL, c("row", "res")))
        rid <- seq_len(nrow(df))
        for (i in seq_along(rules)) {
            k <- which(rules[[i]](rid))
            res[i, "row"] <- k[k >= res[ifelse(i == 1, 1, i - 1), "row"]][1]
            res[i, "res"] <- 1
        }
        res
    }
    
    find_rules_TIC2 <- function(df, rules) {
        rid <- seq_len(nrow(df))
        cbind(
            row = Reduce(\(a, b) b[b >= a][1],
                lapply(rules, \(f) which(f(rid))),
                init = -Inf,
                accumulate = TRUE
            )[-1],
            res = 1
        )
    }
    
    microbenchmark(
        find_rules_mrT(df, rules),
        find_rules_TIC(df, rules),
        find_rules_TIC2(df, rules),
        check = "equal",
        unit = "relative"
    )
    
    

    and we see that

    Unit: relative
                           expr      min       lq     mean   median       uq
      find_rules_mrT(df, rules) 3.825137 3.710938 3.420265 3.628788 3.462264
      find_rules_TIC(df, rules) 1.000000 1.000000 1.000000 1.000000 1.000000
     find_rules_TIC2(df, rules) 1.098361 1.093750 3.288799 1.093434 1.127358
          max neval
      2.89117   100
      1.00000   100
     99.19507   100