rbenchmarking

How to optimize the following conditional assignment of a vector?


I have to classify a lot of crops based on three conditions calculated in a grid of 1e6 points. I'm trying to optimize the function below (hopefully without moving to C or Rust). Any ideas?

Iit's possible to reformat the input data if necessary. I already tried with data.table but the performance was worse.

This is my best shot:

condtion1 <- letters[1:8]
condtion2 <- letters[9:15]
condtion3 <- letters[16:24]

crop <- sample(0:1, 24, replace = T)
names(crop) <- letters[1:24]

n <- 1e6

condtions1 <- sample(condtion1, n, replace = T)
condtions2 <- sample(condtion2, n, replace = T)
condtions3 <- sample(condtion3, n, replace = T)

get_suitability <- function(){
  result <- character(n)
  
  for (i in seq_along(result)) {
    if (crop[[condtions1[[i]]]] == 0 | crop[[condtions2[[i]]]] == 0) result[[i]] <- "not suitable"
    else if(crop[[condtions1[[i]]]] == 1 & crop[[condtions2[[i]]]] == 1 & crop[[condtions3[[i]]]] == 1) result[[i]] <- "suitable"
    else if(crop[[condtions1[[i]]]] == 1 & crop[[condtions2[[i]]]] == 1 & crop[[condtions3[[i]]]] == 0) result[[i]] <- "suitable with irrigation"
  }
  result
}


microbenchmark::microbenchmark(
  get_suitability(),
  times = 5
)
#> Unit: seconds
#>               expr      min       lq     mean   median       uq      max neval
#>  get_suitability() 2.402434 2.408322 2.568981 2.641211 2.667943 2.724993     5

Created on 2024-03-24 with reprex v2.1.0


Solution

  • Vectorise over the condtions getting rid of for/if. The logical indices take care of both for and if.
    In a comment to the question I write:

    You can initialize result <- rep("not suitable", n) and remove one of the if's from the loop.

    Notes:

    condtion1 <- letters[1:8]
    condtion2 <- letters[9:15]
    condtion3 <- letters[16:24]
    
    crop <- sample(0:1, 24, replace = T)
    names(crop) <- letters[1:24]
    
    n <- 1e6
    
    condtions1 <- sample(condtion1, n, replace = T)
    condtions2 <- sample(condtion2, n, replace = T)
    condtions3 <- sample(condtion3, n, replace = T)
    
    get_suitability <- function(){
      result <- character(n)
      
      for (i in seq_along(result)) {
        if (crop[[condtions1[[i]]]] == 0 | crop[[condtions2[[i]]]] == 0) result[[i]] <- "not suitable"
        else if(crop[[condtions1[[i]]]] == 1 & crop[[condtions2[[i]]]] == 1 & crop[[condtions3[[i]]]] == 1) result[[i]] <- "suitable"
        else if(crop[[condtions1[[i]]]] == 1 & crop[[condtions2[[i]]]] == 1 & crop[[condtions3[[i]]]] == 0) result[[i]] <- "suitable with irrigation"
      }
      result
    }
    get_suitability2 <- function(){
      result <- rep("not suitable", n)
      for (i in seq_along(result)) {
        if(crop[[condtions1[[i]]]] == 1 & crop[[condtions2[[i]]]] == 1 & crop[[condtions3[[i]]]] == 1) result[[i]] <- "suitable"
        else if(crop[[condtions1[[i]]]] == 1 & crop[[condtions2[[i]]]] == 1 & crop[[condtions3[[i]]]] == 0) result[[i]] <- "suitable with irrigation"
      }
      result
    }
    get_suitability3 <- function(){
      result <- rep("not suitable", n)
      i1 <- crop[ condtions1 ] == 1 
      i2 <- crop[ condtions2 ] == 1
      i3 <- crop[ condtions3 ] == 1
      result[i1 & i2 & i3] <- "suitable"
      result[i1 & i2 & !i3] <- "suitable with irrigation"
      result
    }
    get_suitability3b <- function(){
      result <- rep("not suitable", n)
      i1 <- crop[ condtions1 ] == 1 & crop[ condtions2 ] == 1
      i3 <- crop[ condtions3 ] == 1
      result[i1 & i3] <- "suitable"
      result[i1 & !i3] <- "suitable with irrigation"
      result
    }
    get_suitability4 <- function(){
      result <- ifelse(crop[condtions1] == 0 | 
                         crop[condtions2] == 0, "not suitable",
                       ifelse(crop[condtions3] == 1, "suitable", 
                              "suitable with irrigation"))
      names(result) <- NULL
      result
    }
    
    library(microbenchmark)
    
    res <- get_suitability()
    res2 <- get_suitability2()
    res3 <- get_suitability3()
    res3b <- get_suitability3b()
    res4 <- get_suitability4()
    
    identical(res, res2)
    #> [1] TRUE
    identical(res, res3)
    #> [1] TRUE
    identical(res, res3b)
    #> [1] TRUE
    identical(res, res4)
    #> [1] TRUE
    
    mb <- microbenchmark(
      get_suitability(),
      get_suitability2(),
      get_suitability3(),
      get_suitability3b(),
      get_suitability4(),
      times = 5L
    )
    print(mb, order = "median")
    #> Unit: milliseconds
    #>                 expr       min        lq      mean    median        uq
    #>  get_suitability3b()  120.5004  123.8272  144.3827  137.7121  158.9400
    #>   get_suitability3()  130.9886  141.4570  158.9099  154.2719  179.9035
    #>   get_suitability4()  630.0646  651.2294  677.3693  687.7445  703.8762
    #>    get_suitability() 1496.4989 1522.9126 1540.5882 1535.8001 1566.6336
    #>   get_suitability2() 2999.3825 3008.2696 3064.8530 3083.5560 3102.7165
    #>        max neval  cld
    #>   180.9339     5   c 
    #>   187.9287     5   c 
    #>   713.9316     5    d
    #>  1581.0956     5 a   
    #>  3130.3405     5  b
    

    Created on 2024-03-24 with reprex v2.1.0