rfor-loopmatrixindexingvectorization

How to remove and speed up a for loop over matrix columns by vectorisation?


Function

# Reverse wins and losses in matrix m
# by swapping opponents within columns.
t.sparse <- function(m, opp) {
  for (k in seq_len(ncol(m))) {
    m[, k] <- m[, k][opp[, k]]
  }
  return(m)
}

Data

# Indexed by player, round.
# First round: Player 1 meets player 4, player 2 meets player 3.
opponents <- matrix(c(4, 2, NA,
                      3, 1, 4,
                      2, 4, NA,
                      1, 3, 2
                     ), nrow = 4, byrow = TRUE
                   )
# Player 1 draws player 4, player 2 beats player 3 in first round
results   <- matrix(c(1, 2, NA, 
                      2, 0, 3,
                      0, 2, NA,
                      1, 0, 1
                     ), nrow = 4, byrow = TRUE
                    )

# In the results matrix, outcomes between two players are reversed.
t.sparse(results, opponents)
# Gives:
#      [,1] [,2] [,3]
# [1,]    1    0   NA
# [2,]    0    2    3
# [3,]    2    0   NA
# [4,]    1    2    1

The matrices are modest in size: 1300 x 10 (max), 125 x 12, 26 x 26

#Githubs copilot suggests

tv.sparse <- function(m, opp) {
  m <- m[cbind(seq_along(m), opp)]
  return(m)
}

This is obviously not equivalent to the original function.

Q: How to do this in a proper way?

Benchmark

tv.sparse <-  function(m, opp) {
  res <- matrix(NA, nrow = nrow(m), ncol = ncol(m))
  col_ids <- matrix(rep(seq_len(ncol(m)), each = nrow(m)), nrow = nrow(m)) 
  valid <- !is.na(opp)
  res[valid] <- m[cbind(opp[valid], col_ids[valid])]
  return(res)
}

library(bench)
bench::mark(
  "forloop" = ttt <-  t.sparse(results, opponents),
  "vect1"   = vv1 <- tv.sparse(results, opponents),
  check=TRUE
)

## A tibble: 2 × 13
#  expression      min median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
#  <bch:expr> <bch:tm> <bch:>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
#1 forloop       4.6µs  5.4µs   179329.        0B     89.7  9995     5     55.7ms
#2 vect1         8.5µs  9.6µs   101872.    59.8KB     91.8  9991     9     98.1ms
## ℹ 4 more variables: result <list>, memory <list>, time <list>, gc <list>

identical(ttt, vv1)
# [1] TRUE


Solution

  • It looks like you're extracting, for each player and round, the result of their opponent in that same round as stored in the opponents matrix. This can be vectorised as follows:

    get_output <- function(opponents, results) {
        indices <- matrix(c(opponents, col(opponents)), ncol = 2)
        matrix(
            results[indices],
            ncol = ncol(opponents)
        )
    }
    

    We can use the opponents matrix to select the desired row and the base col() function to represent the appropriate column. Each rows of the indices matrix represents the desired row and column, e.g. row 1 is:

    #       [,1] [,2]
    #  [1,]    4    1
    

    This means that the first value of the output will be results[4, 1], i.e. the result for player four (the first opponent of player one) in the first round. This returns the desired results:

    identical(
        get_output(opponents, results),
        t.sparse(results, opponents)
    )
    # [1] TRUE
    

    Benchmark

    Here is a benchmark with 10 to 100,000 opponents and 10 to 110 rounds:

    out <- bench::press(
        n_players = 10^c(1:5),
        n_rounds = seq(from = 10, to = 110, by = 25),
        {
            opponents <- sample(n_rounds, size = n_rounds * n_players, replace = TRUE) |>
                matrix(ncol = n_players)
            results <- sample(c(0, 1, 2), n_rounds * n_players, replace = TRUE) |>
                matrix(ncol = n_players)
            bench::mark(
                t.sparse(results, opponents),
                get_output(opponents, results)
            )
        }
    )
    

    This is a quicker than your approach. Note that times are on a log scale, so for example with 100k opponents and 110 rounds the median time for this approach is 143ms compared with 997ms using a loop:

    benchmark results

    Full benchmark output:

    # A tibble: 50 × 9
       expression                     n_players n_rounds      min   median `itr/sec` mem_alloc `gc/sec` n_itr
       <bch:expr>                         <dbl>    <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int>
     1 t.sparse(results, opponents)          10       10     17µs   20.4µs  45816.        848B    18.3   9996
     2 get_output(opponents, results)        10       10   9.43µs  10.29µs  89950.      4.19KB     9.00  9999
     3 t.sparse(results, opponents)         100       10 145.46µs 187.75µs   5149.      7.86KB    17.0   2425
     4 get_output(opponents, results)       100       10  24.11µs  26.21µs  34482.     39.34KB     3.45  9999
     5 t.sparse(results, opponents)        1000       10 789.04µs  829.1µs    893.     78.17KB    28.5    376
     6 get_output(opponents, results)      1000       10  78.91µs  81.71µs  11930.    390.91KB    16.7   5726
     7 t.sparse(results, opponents)       10000       10   8.04ms   8.13ms    122.     781.3KB    56.3     39
     8 get_output(opponents, results)     10000       10 810.83µs 851.79µs   1166.      3.81MB    18.9    555
     9 t.sparse(results, opponents)      100000       10  85.62ms  86.26ms     11.6     7.63MB    36.6      6
    10 get_output(opponents, results)    100000       10   7.97ms    9.4ms    100.     38.15MB    21.6     51
    11 t.sparse(results, opponents)          10       35  12.22µs  13.24µs  71713.     16.69KB    21.5   9997
    12 get_output(opponents, results)        10       35   6.37µs   6.95µs 139477.     13.95KB    13.9   9999
    13 t.sparse(results, opponents)         100       35 109.97µs 115.56µs   8427.    166.45KB    26.7   3471
    14 get_output(opponents, results)       100       35  30.49µs  32.12µs  30153.       137KB    18.1   9994
    15 t.sparse(results, opponents)        1000       35   1.13ms   1.17ms    848.      1.62MB    28.0    333
    16 get_output(opponents, results)      1000       35 300.09µs 313.72µs   3168.      1.33MB    18.9   1511
    17 t.sparse(results, opponents)       10000       35  11.31ms  11.67ms     85.8    16.25MB   132.      13
    18 get_output(opponents, results)     10000       35   3.21ms   3.29ms    303.     13.35MB    36.1    126
    19 t.sparse(results, opponents)      100000       35 504.39ms 504.39ms      1.98  162.51MB     1.98     1
    20 get_output(opponents, results)    100000       35  40.63ms  42.72ms     20.7   133.51MB    26.3     11
    21 t.sparse(results, opponents)          10       60  14.93µs   16.2µs  59672.      26.3KB    11.9   9998
    22 get_output(opponents, results)        10       60   8.26µs   9.44µs 100450.     23.72KB    10.0   9999
    23 t.sparse(results, opponents)         100       60 133.79µs 142.07µs   6813.    262.55KB    11.9   2865
    24 get_output(opponents, results)       100       60  48.86µs  51.82µs  18339.    234.66KB    19.9   6449
    25 t.sparse(results, opponents)        1000       60   1.32ms   1.39ms    715.      2.56MB     6.99   307
    26 get_output(opponents, results)      1000       60 506.89µs 527.29µs   1707.      2.29MB    17.0    804
    27 t.sparse(results, opponents)       10000       60  13.42ms  13.66ms     72.9    25.64MB    10.4     28
    28 get_output(opponents, results)     10000       60   5.51ms   5.72ms    172.     22.89MB    34.4     65
    29 t.sparse(results, opponents)      100000       60 175.32ms  290.8ms      3.44  256.35MB     3.44     2
    30 get_output(opponents, results)    100000       60  97.38ms 124.69ms      5.50  228.88MB     7.33     3
    31 t.sparse(results, opponents)          10       85  16.89µs  18.87µs  43977.     36.22KB     0    10000
    32 get_output(opponents, results)        10       85  10.13µs  11.52µs  70925.     33.48KB     7.09  9999
    33 t.sparse(results, opponents)         100       85 153.67µs 163.72µs   5712.    361.77KB     4.29  1333
    34 get_output(opponents, results)       100       85  68.38µs  72.85µs   9163.    332.31KB     4.36  4208
    35 t.sparse(results, opponents)        1000       85   1.56ms   1.66ms    596.      3.53MB     4.73   252
    36 get_output(opponents, results)      1000       85 719.87µs 747.88µs   1330.      3.24MB     8.76   607
    37 t.sparse(results, opponents)       10000       85  15.39ms   15.8ms     62.5    35.32MB     4.81    26
    38 get_output(opponents, results)     10000       85   7.96ms   8.09ms    111.     32.42MB     9.47    47
    39 t.sparse(results, opponents)      100000       85 194.21ms 211.06ms      4.78  353.24MB     7.97     3
    40 get_output(opponents, results)    100000       85 167.47ms 462.31ms      2.16  324.25MB     3.24     2
    41 t.sparse(results, opponents)          10      110  18.68µs  20.75µs  46004.     45.83KB     4.60  9999
    42 get_output(opponents, results)        10      110  11.91µs  13.23µs  68648.     43.25KB     0    10000
    43 t.sparse(results, opponents)         100      110 171.37µs 186.09µs   5215.    457.86KB     4.84  2156
    44 get_output(opponents, results)       100      110  87.14µs  92.02µs   6690.    429.97KB     4.22  3173
    45 t.sparse(results, opponents)        1000      110    1.9ms   1.96ms    510.      4.47MB     8.10    63
    46 get_output(opponents, results)      1000      110 934.73µs   1.93ms    649.       4.2MB     2.09   311
    47 t.sparse(results, opponents)       10000      110   17.4ms  17.89ms     55.4    44.71MB     5.03    22
    48 get_output(opponents, results)     10000      110  10.43ms  10.68ms     78.4    41.96MB     4.48    35
    49 t.sparse(results, opponents)      100000      110 997.75ms 997.75ms      1.00  447.08MB     1.00     1
    50 get_output(opponents, results)    100000      110 135.68ms  143.2ms      6.71  419.62MB     6.71     4