
Efficiently reconstruct state variable from event times in R


I have a discrete-time stochastic model where individuals enter in state 0, can step their state value up or down, and then exit. The numbers of individuals and time steps are very large, so I only save the times of entry, state transitions (up, down), and exit. From these logs, I want to reconstruct the mean state value among active individuals (after entry and before exit) for all time steps.

Problem / Question: My current solution (the MWE below) is not very fast, especially for large ni and tf. Can we obtain val.t / act.t more efficiently? Ideally this would avoid the loop, but I'm not sure that's possible given the ragged nature of to.i and tx.i (each individual experiences a random number of events). I know we could speed things up with parallel::, but I'm already using all CPUs to run model instances in parallel.

MWE

# config
set.seed(1)
ni = 1000    # num individuals (i)
ne = 100     # mean num state change events per i
tf = 365*100 # total timesteps
# generate event times (dummy model)
t0.i = runif(ni,1,tf/2)   # model entry time per i
tf.i = t0.i + tf/2        # model exit time per i
ne.i = rpois(ni,ne) + 1   # num events per i
to.i = lapply(1:ni,function(i){ round(runif(ne.i[i],t0.i[i],tf.i[i])) }) # state up times
tx.i = lapply(1:ni,function(i){ to.i[[i]] + rpois(ne.i[i],365) })        # state down times
# mean value among active
act.t = numeric(tf) # will hold total num active
val.t = numeric(tf) # will hold total state value among active
for (i in 1:ni){        # for each i
  ti = t0.i[i]:tf.i[i]  # timesteps active
  val.i = cumsum(tabulate(to.i[[i]],tf)-tabulate(tx.i[[i]],tf)) # state value
  act.t[ti] = act.t[ti] + 1         # add to active
  val.t[ti] = val.t[ti] + val.i[ti] # add to state value
}
# plot
plot(val.t/act.t,type='l') # mean value among active
lines(act.t/ni,col='red') # proportion active

Plot (FYI)

[plot: mean state value among active individuals (black) and proportion active (red) over time]


Solution

  • Below is a fully vectorized base R approach; on the MWE it gives a ~75x speedup. The key observation is that the total state value at time t is just the number of up events at or before t minus the number of down events at or before t, so a single tabulate() plus cumsum() over the pooled event times of all individuals reconstructs val.t, and the same applies to the entry/exit times for act.t. Down times are clamped at tf.i + 1 so that each individual's net contribution drops to zero once it exits, and the event generation itself is vectorized with rep.int(), removing the per-individual loop entirely.

    # config
    set.seed(1)
    ni <- 1000    # num individuals (i)
    ne <- 100     # mean num state change events per i
    tf <- 365*100 # total timesteps
    t0.i <- runif(ni,1,tf/2)   # model entry time per i
    tf.i <- t0.i + tf/2        # model exit time per i
    ne.i <- rpois(ni,ne) + 1   # num events per i
    # generate event times (dummy model)
    ne.tot <- sum(ne.i)
    to.i <- round(runif(ne.tot, rep.int(t0.i, ne.i), rep.int(tf.i, ne.i)))
    tx.i <- pmin(to.i + rpois(ne.tot, 365), rep.int(tf.i + 1, ne.i))
    act.t <- cumsum(tabulate(t0.i, tf) - tabulate(tf.i + 1, tf))
    val.t <- cumsum(tabulate(to.i, tf) - tabulate(tx.i, tf))
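
    For completeness (my addition, not part of the original answer), the vectorized act.t / val.t can be plotted exactly as in the MWE:

    # reproduce the MWE plot from the vectorized results
    plot(val.t/act.t, type='l') # mean value among active
    lines(act.t/ni, col='red')  # proportion active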
    

    Benchmarking:

    f1 <- function() {
      # config
      set.seed(1)
      ni = 1000    # num individuals (i)
      ne = 100     # mean num state change events per i
      tf = 365*100 # total timesteps
      # generate event times (dummy model)
      t0.i = runif(ni,1,tf/2)   # model entry time per i
      tf.i = t0.i + tf/2        # model exit time per i
      ne.i = rpois(ni,ne) + 1   # num events per i
      to.i = lapply(1:ni,function(i){ round(runif(ne.i[i],t0.i[i],tf.i[i])) }) # state up times
      tx.i = lapply(1:ni,function(i){ to.i[[i]] + rpois(ne.i[i],365) })        # state down times
      # mean value among active
      act.t = numeric(tf) # will hold total num active
      val.t = numeric(tf) # will hold total state value among active
      for (i in 1:ni){        # for each i
        ti = t0.i[i]:tf.i[i]  # timesteps active
        val.i = cumsum(tabulate(to.i[[i]],tf)-tabulate(tx.i[[i]],tf)) # state value
        act.t[ti] = act.t[ti] + 1         # add to active
        val.t[ti] = val.t[ti] + val.i[ti] # add to state value
      }
      list(act.t = act.t, val.t = val.t)
    }
    
    f2 <- function() {
      # config
      set.seed(1)
      ni <- 1000    # num individuals (i)
      ne <- 100     # mean num state change events per i
      tf <- 365*100 # total timesteps
      t0.i <- runif(ni,1,tf/2)   # model entry time per i
      tf.i <- t0.i + tf/2        # model exit time per i
      ne.i <- rpois(ni,ne) + 1   # num events per i
      # generate event times (dummy model)
      ne.tot <- sum(ne.i)
      to.i <- round(runif(ne.tot, rep.int(t0.i, ne.i), rep.int(tf.i, ne.i)))
      tx.i <- pmin(to.i + rpois(ne.tot, 365), rep.int(tf.i + 1, ne.i))
      list(act.t = cumsum(tabulate(t0.i, tf) - tabulate(tf.i + 1, tf)),
           val.t = cumsum(tabulate(to.i, tf) - tabulate(tx.i, tf)))
    }
    

    Result:

    microbenchmark::microbenchmark(
      f1 = f1(),
      f2 = f2(),
      times = 10,
      check = "equal",
      unit = "relative"
    )
    #> Unit: relative
    #>  expr      min       lq     mean   median       uq      max neval cld
    #>    f1 82.23186 82.00272 78.94775 91.66146 91.71763 47.53844    10  a 
    #>    f2  1.00000  1.00000  1.00000  1.00000  1.00000  1.00000    10   b
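
    A further note (my addition, not from the original answer): both f1 and f2 re-generate the event data, so the measured speedup also reflects the vectorized data generation, not just the reconstruction. If the ragged lists from the MWE already exist (to.i / tx.i as lists, plus t0.i, tf.i, ne.i, tf, and the loop's act.t / val.t), the same tabulate()/cumsum() idea can be applied after flattening with unlist() and rep.int(). A sketch, assuming those MWE objects are still in the workspace:

    # sketch: vectorized reconstruction from the existing ragged lists (no re-simulation)
    to.v <- unlist(to.i)                                 # all up times, pooled
    tx.v <- pmin(unlist(tx.i), rep.int(tf.i + 1, ne.i))  # all down times, clamped just after exit
    act.t2 <- cumsum(tabulate(t0.i, tf) - tabulate(tf.i + 1, tf))
    val.t2 <- cumsum(tabulate(to.v, tf) - tabulate(tx.v, tf))
    all.equal(act.t2, act.t) # expected TRUE
    all.equal(val.t2, val.t) # expected TRUE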