rperformancematrix

Find the row in matrix B that is closest to each row in matrix A


Let's say I have two matrices, A and B (`m1` and `m2` in the code), given by:

# Fix the RNG seed so both example matrices are reproducible.
set.seed(123)
# Two 10 x 5 matrices of uniform random draws; m1 is drawn first, then m2.
m1 <- matrix(data = runif(n = 10 * 5), nrow = 10, ncol = 5)
m2 <- matrix(data = runif(n = 10 * 5), nrow = 10, ncol = 5)

For each row in matrix A, I want to find the row in matrix B that is closest to it. I know I can do this by looping over each row in A and comparing it to every row in B, like this:

# For every row of m1, print the index of the m2 row with the smallest
# sqrt(sum(|a - b|)) score (first such row on ties, because of the strict `<`).
for (i in seq_len(nrow(m1))) {  # seq_len(): no iterations when m1 has 0 rows
  best_dist <- Inf              # Inf, not 9999, so arbitrarily large scores can still win
  index <- -1L                  # remains -1 only when m2 has no rows to compare against
  for (j in seq_len(nrow(m2))) {
    test <- sqrt(sum(abs(m1[i, ] - m2[j, ])))

    if (test < best_dist) {
      best_dist <- test
      index <- j
    }
  }
  print(index)
}

However, I have a million rows and it takes forever. I'm struggling to find an efficient way. Any ideas?


Solution

  • Here is one base R solution using `apply`:

    # Transpose m2 once, up front: each row of m1 (a vector of length ncol) then
    # recycles down the columns of m2_t, so column j holds m1[i, ] - m2[j, ].
    m2_t <- t(m2)
    apply(m1, 1, \(x) which.min(sqrt(colSums(abs(x - m2_t)))))
    #[1]  8  3  2  3  3  1  2  3  6 10
    

    Compared with your current solution, it fares well:

    # Same reproducible example data as in the question: two 10 x 5 matrices
    # of uniform random draws, m1 drawn first, then m2.
    set.seed(123)
    m1 <- matrix(data = runif(n = 10 * 5), nrow = 10, ncol = 5)
    m2 <- matrix(data = runif(n = 10 * 5), nrow = 10, ncol = 5)
    
    #' Index of the closest row of `m2` for each row of `m1`
    #'
    #' Scores every pair of rows with `sqrt(sum(abs(a - b)))` and, for each row
    #' of `m1`, picks the row of `m2` with the smallest score via `which.min()`
    #' (the first one on ties).
    #'
    #' @param m1 Numeric matrix whose rows are the query points.
    #' @param m2 Numeric matrix whose rows are the candidates; it must have the
    #'   same number of columns as `m1` for the recycling below to line up.
    #' @return For non-empty inputs without missing values, an integer vector
    #'   with one `m2` row index per row of `m1`.
    baseR_sol <- function(m1, m2) {
      # Transpose once, outside the per-row callback: t(m2) does not depend on
      # the row being processed, so recomputing it for every row is wasted work.
      m2_t <- t(m2)
      # x (length ncol) recycles down each column of m2_t, so column j holds
      # x - m2[j, ]; colSums() then yields one score per row of m2.
      apply(m1, 1, \(x) which.min(sqrt(colSums(abs(x - m2_t)))))
    }
    
    #' Nested-loop search for the closest row of `m2` for each row of `m1`
    #'
    #' Compares every row of `m1` with every row of `m2` using the score
    #' `sqrt(sum(abs(a - b)))` and prints, for each row of `m1`, the index of
    #' the best-scoring row of `m2` (the first one on ties).
    #'
    #' @param m1 Numeric matrix whose rows are the query points.
    #' @param m2 Numeric matrix whose rows are the candidates.
    #' @return Invisibly, an integer vector with the printed index for each row
    #'   of `m1`; an entry is `-1` when `m2` has no rows.
    for_loop_sol <- function(m1, m2) {
      n_query <- nrow(m1)
      n_cand <- nrow(m2)
      nearest <- integer(n_query)  # preallocated, filled row by row

      # seq_len() instead of 1:n so a zero-row matrix produces no iterations
      # (1:0 would count 1, 0 and index out of bounds).
      for (i in seq_len(n_query)) {
        # Start from Inf rather than a magic 9999, so that scores above 9999
        # can still be selected.
        best_dist <- Inf
        index <- -1L
        for (j in seq_len(n_cand)) {
          test <- sqrt(sum(abs(m1[i, ] - m2[j, ])))

          if (test < best_dist) {
            best_dist <- test
            index <- j
          }
        }
        # Printing is kept for compatibility with the original behaviour; note
        # that it is therefore part of whatever time a benchmark measures.
        print(index)
        nearest[i] <- index
      }
      invisible(nearest)
    }
    
    
    # Time both implementations on the same inputs, 10 evaluations each.
    microbenchmark::microbenchmark(
      baseR_sol = baseR_sol(m1, m2),
      for_loop_sol = for_loop_sol(m1, m2),
      times = 10L
    )
    
    #         expr   min    lq    mean median     uq    max neval
    #    baseR_sol 158.0 185.2  865.81 195.35  224.8 6902.8    10
    # for_loop_sol 764.6 830.2 1051.29 973.45 1312.0 1348.9    10