rdataframeperformancesampling

Improve the performance of recursive sampling function


As a follow-up to my previous question, I'm interested in improving the performance of the existing recursive sampling function.

By recursive sampling I mean randomly choosing up to n unique unexposed IDs for a given exposed ID, and then randomly choosing up to n unique unexposed IDs from the remaining unexposed IDs for another exposed ID. If there are no remaining unexposed IDs for a given exposed ID, then the exposed ID is left out.

The original function is as follows:

recursive_sample <- function(data, n) {
 
 groups <- unique(data[["exposed"]])
 out <- data.frame(exposed = character(), unexposed = character())
 
 for (group in groups) {
  
  chosen <- data %>%
   filter(exposed == group,
          !unexposed %in% out$unexposed) %>%
   group_by(unexposed) %>%
   slice(1) %>%
   ungroup() %>%
   sample_n(size = min(n, nrow(.))) 
  
  out <- rbind(out, chosen)
  
 }
 
 out
 
}

I was able to create a more efficient one as follows:

recursive_sample2 <- function(data, n) {
 
 groups <- unique(data[["exposed"]])
 out <- tibble(exposed = integer(), unexposed = integer())
 
 for (group in groups) {
  
  chosen <- data %>%
   filter(exposed == group,
          !unexposed %in% out$unexposed) %>%
   filter(!duplicated(unexposed)) %>%
   sample_n(size = min(n, nrow(.))) 
  
  out <- bind_rows(out, chosen)
  
 }
 
 out
 
}

Sample data and bechmarking:

set.seed(123)
df <- tibble(exposed = rep(1:100, each = 100),
             unexposed = sample(1:7000, 10000, replace = TRUE))

microbenchmark(f1 = recursive_sample(df, 5),
               f2 = recursive_sample2(df, 5),
               times = 10)

Unit: milliseconds
 expr       min        lq      mean    median        uq      max neval cld
   f1 1307.7198 1316.5276 1379.0533 1371.3952 1416.6360 1540.955    10   b
   f2  839.0086  865.2547  914.8327  901.2288  970.9518 1036.170    10  a 

However, for my actual dataset, I would need an even more efficient (i.e., quicker) function. Any ideas for a more efficient version, whether in data.table, involving parallelisation or other approaches are welcome.

December 2023 edit

The versions from @minem and @ThomasIsCoding are substantially faster; however, they return only partially correct results. Considering sample data such as:

df <- structure(list(exposed = c(4L, 4L, 4L, 1L, 1L, 1L, 3L, 2L, 2L, 
                                 2L, 2L, 2L), unexposed = c(1L, 2L, 2L, 1L, 2L, 3L, 10L, 4L, 5L, 
                                                            7L, 8L, 9L), rowid = 1:12), class = "data.frame", row.names = c("1", 
                                                                                                                            "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"))

   exposed unexposed rowid
1        4         1     1
2        4         2     2
3        4         2     3
4        1         1     4
5        1         2     5
6        1         3     6
7        3        10     7
8        2         4     8
9        2         5     9
10       2         7    10
11       2         8    11
12       2         9    12

I would expect exposed == 4 to be sampled twice, exposed == 1 to be sampled once, exposed == 3 to be sampled once, and exposed == 2 to be sampled twice. In other words, the sampling procedure should reflect the provided order of data. Desired output:

exposed unexposed rowid
1        4         1     1
2        4         2     2
6        1         3     6
7        3        10     7
8        2         4     8
11       2         8    11

Solution

  • Updated 12/13/23

    A data.table solution that keeps a running list of sampled values and uses %!in% from collapse to prevent them from being sampled again:

    library(data.table)
    library(collapse) # for %!in%
    
    fjblood1 <- function(data, n) {
      setDT(data)
      # randomly select unique exposed/unexposed rows
      u <- 1:nrow(data)
      if (anyDuplicated(data, by = c("exposed", "unexposed"))) {
        u <- u[
          data[,i := .I][
            sample(.N), -i[duplicated(.SD)], .SDcols = c("exposed", "unexposed")
          ]
        ]
        data[,i := NULL]
      } 
      sampled <- vector(mode(data$unexposed), length(u))
      k <- 0L
      data[
        u, .(
          unexposed = {
            x <- unexposed[unexposed %!in% sampled[seq_len(k)]]
            x <- x[sample.int(length(x), min(length(x), n))]
            sampled[seq.int(k + 1L, along.with = x)] <- x
            k <- k + length(x)
            x
          }
        ), exposed
      ]
    }
    

    Alternatively, if df has additional columns that you want to preserve:

    fjblood2 <- function(data, n) {
      setDT(data)
      u <- 1:nrow(data)
      if (anyDuplicated(data, by = c("exposed", "unexposed"))) {
        u <- u[
          data[,i := .I][
            sample(.N), -i[duplicated(.SD)], .SDcols = c("exposed", "unexposed")
          ]
        ]
        data[,i := NULL]
      } 
      sampled <- vector(mode(data$unexposed), length(u))
      k <- 0L
      data[
        u, {
          i <- which(unexposed %!in% sampled[seq_len(k)])
          i <- i[sample.int(length(i), min(length(i), n))]
          sampled[seq.int(k + 1L, along.with = i)] <- unexposed[i]
          k <- k + length(i)
          .SD[i]
        }, exposed
      ]
    }
    

    Other candidate functions:

    ftmfmnk <- function(data, n) {
      groups <- unique(data[["exposed"]])
      out <- tibble(exposed = integer(), unexposed = integer())
      
      for (group in groups) {
        chosen <- data %>%
          filter(
            exposed == group,
            !unexposed %in% out$unexposed
          ) %>%
          filter(!duplicated(unexposed)) %>%
          sample_n(size = min(n, nrow(.)))
        
        out <- bind_rows(out, chosen)
      }
      
      out
    }
    
    fminem <- function(data, n) {
      groups <- unique(data[["exposed"]])
      # working on vectors is faster
      id <- 1:nrow(data)
      i <- vector("integer")
      unexposed2 <- vector(class(data$unexposed))
      ex <- data$exposed
      ux <- data$unexposed
      
      for (group in groups) {
        f1 <- ex == group # first filter
        f2 <- !ux[f1] %in% unexposed2 # 2nd filter (only on those that match 1st)
        id3 <- id[f1][f2][!duplicated(ux[f1][f2])] # check duplicates only on needed
        # and select necesary row ids
        is <- sample(id3, size = min(length(id3), n)) # sample row ids
        i <- c(i, is) # add to list
        unexposed2 <- ux[i] # resave unexposed2
      }
      out <- data[i, ] # only one data.frame subset
      out$id <- NULL
      out
    }
    
    ftic <- function(df, n) {
      lst <- split(df, ~exposed)[as.character(unique(df$exposed))]
      pool <- c()
      for (k in seq_along(lst)) {
        d <- lst[[k]]
        dd <- subset(d[sample.int(nrow(d)), ], !duplicated(unexposed) & !unexposed %in% pool)
        lst[[k]] <- head(dd, n)
        pool <- c(pool, dd$unexposed)
      }
      `rownames<-`(do.call(rbind, lst), NULL)
    }
    
    `%nin%` <- function (x, table) {
      match(x, table, nomatch = 0L) == 0L
    }
    
    fmnist_fast <- function(dat, n) {
      # make a named vector
      vec <- setNames(dat$unexposed, dat$exposed)
      
      # Randomly shuffle the groups
      ord <- unique(names(vec))
      
      # random order the vector
      vec <- sample(vec, size = length(vec))
      
      # keep the names for sampling and later
      nms <- names(vec)
      
      
      # note that there is no need to sample again since both the order 
      # of the groups as well as the order within the groups (vec) was sampled.
      
      # by setting an initial value, we avoid an exception in the first iteration
      init <- head(unique(vec[nms == ord[1]]), n)
      
      # `unique()` drops the names 
      # yet names are needed to retain the initial data structure
      names(init) <- rep(ord[1], length(init))
      
      res <- Reduce(
        \(x, y) {
          # current subgroup
          vec_sub <- vec[nms == y]
          
          # without unique, duplicated values inside of the group 
          # can be selected multiple times
          res_vec <- head(unique(vec_sub[vec_sub %nin% x]), n)
          
          # in case all values of the current group are already sampled
          len_vec <- length(res_vec)
          if (len_vec == 0) {
            return(x)
          }
          
          # `unique()` drops the names 
          # yet names are needed to retain the initial data structure
          names(res_vec) <- rep(y, len_vec)
          
          # concate
          return(c(x, res_vec))
        },
        # first group is already in init
        ord[-1], 
        # first group, randomly sampled (see above)
        init = init
      )
      
      # back to a data.frame
      res_dat <- data.frame(exposed   = names(res),
                            unexposed = res)
      return(res_dat)
    }
    

    Compare the results from the dataset in the question (with two additional columns to demonstrate fjblood2):

    df <- data.frame(exposed = c(4L, 4L, 4L, 1L, 1L, 1L, 3L, 2L, 2L, 2L, 2L, 2L),
                     unexposed = c(1L, 2L, 2L, 1L, 2L, 3L, 10L, 4L, 5L, 7L, 8L, 9L),
                     rowid = 1:12)
    
    dt <- as.data.table(df)
    fjblood1(dt, 2L)
    #>    exposed unexposed
    #> 1:       4         1
    #> 2:       4         2
    #> 3:       1         3
    #> 4:       3        10
    #> 5:       2         4
    #> 6:       2         7
    fjblood2(dt, 2L)
    #>    exposed unexposed rowid
    #> 1:       4         2     3
    #> 2:       4         1     1
    #> 3:       1         3     6
    #> 4:       3        10     7
    #> 5:       2         8    11
    #> 6:       2         9    12
    ftmfmnk(df, 2L)
    #> # A tibble: 6 × 3
    #>   exposed unexposed rowid
    #>     <int>     <int> <int>
    #> 1       4         1     1
    #> 2       4         2     2
    #> 3       1         3     6
    #> 4       3        10     7
    #> 5       2         4     8
    #> 6       2         9    12
    fminem(df, 2L)
    #>    exposed unexposed rowid
    #> 1        4         1     1
    #> 2        4         2     2
    #> 3        4         2     3
    #> 5        1         2     5
    #> 10       2         7    10
    #> 9        2         5     9
    fmnist_fast(df, 2L)
    #>   exposed unexposed
    #> 1       4         2
    #> 2       4         1
    #> 3       1         3
    #> 4       3        10
    #> 5       2         5
    #> 6       2         8
    ftic(df, 2L)
    #>   exposed unexposed rowid
    #> 1       4         2     3
    #> 2       4         1     1
    #> 3       1         3     6
    #> 4       3        10     7
    #> 5       2         7    10
    #> 6       2         5     9
    

    As pointed out in the updated question, fminem does not behave as desired.

    Benchmarking with a larger dataset:

    set.seed(123)
    df <- tibble(
      exposed = rep(1:1000, each = 1000),
      unexposed = sample(7e4, 1e6, 1),
      data1 = runif(1e6),
      data2 = sample(LETTERS, 1e6, 1)
    )
    dt <- as.data.table(df)
    
    microbenchmark::microbenchmark(
      ftmfmnk = ftmfmnk(df, 100L),
      fminem = fminem(df, 100L),
      ftic = ftic(df, 100L),
      fmnist_fast(df, 100L),
      fjblood1 = fjblood1(dt, 100L),
      fjblood2 = fjblood2(dt, 100L),
      times = 1
    )
    #> Unit: milliseconds
    #>                   expr        min         lq       mean     median         uq        max neval
    #>                ftmfmnk 74362.9330 74362.9330 74362.9330 74362.9330 74362.9330 74362.9330     1
    #>                 fminem  5675.6071  5675.6071  5675.6071  5675.6071  5675.6071  5675.6071     1
    #>                   ftic  4624.8753  4624.8753  4624.8753  4624.8753  4624.8753  4624.8753     1
    #>  fmnist_fast(df, 100L) 16919.1535 16919.1535 16919.1535 16919.1535 16919.1535 16919.1535     1
    #>               fjblood1   943.9751   943.9751   943.9751   943.9751   943.9751   943.9751     1
    #>               fjblood2   909.6813   909.6813   909.6813   909.6813   909.6813   909.6813     1
    

    Another benchmark with more replications and excluding ftmfmnk:

    microbenchmark::microbenchmark(
      # ftmfmnk = ftmfmnk(df, 100L),
      fminem = fminem(df, 100L),
      ftic = ftic(df, 100L),
      fmnist_fast(df, 100L),
      fjblood1 = fjblood1(dt, 100L),
      fjblood2 = fjblood2(dt, 100L),
      times = 10
    )
    #> Unit: milliseconds
    #>                   expr        min         lq       mean    median         uq        max neval
    #>                 fminem  5518.5713  5550.8040  5614.1822  5600.498  5615.9687  5788.3764    10
    #>                   ftic  4370.4639  4412.8366  4467.5108  4434.879  4558.6254  4607.8332    10
    #>  fmnist_fast(df, 100L) 16483.5461 16524.7059 16639.4094 16602.269 16708.1953 17001.8668    10
    #>               fjblood1   618.8821   628.6334   644.4724   636.097   646.6322   693.7889    10
    #>               fjblood2   707.1110   730.6251   756.7017   754.079   788.2097   814.0123    10