rperformanceloopsiterationvectorization

Improving performance of iteration in R


I originally created a for loop to calculate numerous variables that were sometimes dependent on previous iterations e.g. [i] and [i-1].

To improve the performance for larger datasets I attempted to use vectorization and produced the following code, which is quicker. The dataframe I hope to use this code on has a length of ~200,000 observations. Is there anyway I can improve the performance of this code further? I hope to run different scenarios in the future, which will also increase computing time.

# Load required packages
library(dplyr)
library(purrr)

# Set seed for reproducibility
set.seed(42)

# Generate random data for storm
n <- 10000  # number of rows
storm <- data.frame(
  date_time = seq.POSIXt(from = as.POSIXct("2021-01-01"), by = "day", length.out = n),
  temp = runif(n, min = -10, max = 35),
  Rain = runif(n, min = 0, max = 50),
  PET = runif(n, min = 0, max = 10),
  Qin_m3 = runif(n, min = 0, max = 100)
)

# Placeholder functions
TSA_depth.fn <- function(volume, TSA_dimensions) {
  # Example behavior: return volume divided by some constant
  return(volume / TSA_dimensions)
}

soil_infiltration.fn <- function(volume, MRC_df, MRC_name, pipe_base_vol) {
  # Example behavior: return volume times some fraction
  return(volume * 0.1)
}

outlet_pipe.fn <- function(diameter, waterDepth, pipe_base, Cd, maxTSA_height) {
  # Example behavior: return some function of the inputs
  return(diameter * waterDepth * Cd)
}

# Constants
maxTSA_area <- 1000
TSA_dimensions <- 50
pipe_diameter <- 0.5
pipe_base_m <- 0.1
Cd <- 0.6
maxTSA_height <- 2.5
maxTSA_volume <- 2000
MRC_df <- data.frame()  # Example placeholder
MRC_name <- "example"
pipe_base_vol <- 0.1

# Initial data preparation
TSA_model <- storm %>%
  select(date_time, temp, Rain, PET, Qin_m3) %>%
  mutate(Qin_m3 = replace(Qin_m3, 1, 0))

# Initialize new columns with 0 and precompute PET_m3
TSA_model <- TSA_model %>%
  mutate(S = 0, dS = 0, depth = 0, PET_m3 = round((PET / 1000) * maxTSA_area, digits = 3), 
         soil_m3 = 0, pipe_m3 = 0, overflow_m3 = 0, Qout = 0)

# Function to calculate the values for the rows
calculate_values <- function(df) {
  n <- nrow(df)
  for (i in 2:n) {
    df$dS[i] <- df$Qin_m3[i] - df$Qout[i - 1]
    df$S[i] <- df$S[i - 1] + df$dS[i]
    df$depth[i] <- TSA_depth.fn(volume = df$S[i], TSA_dimensions = TSA_dimensions)
    
    soil_infiltration <- soil_infiltration.fn(volume = df$S[i], MRC_df = MRC_df, MRC_name = MRC_name, pipe_base_vol = pipe_base_vol)
    df$soil_m3[i] <- max(soil_infiltration - df$PET_m3[i], 0)
    df$soil_m3[i] <- ifelse(df$S[i] - df$soil_m3[i] < 0, df$S[i], df$soil_m3[i])
    
    pipe_outflow <- outlet_pipe.fn(diameter = pipe_diameter, waterDepth = df$depth[i], pipe_base = pipe_base_m, Cd = Cd, maxTSA_height = maxTSA_height)
    df$pipe_m3[i] <- max(pipe_outflow, 0)
    
    df$overflow_m3[i] <- ifelse(df$S[i] > maxTSA_volume,
                                max(df$S[i] - maxTSA_volume - df$pipe_m3[i] - df$soil_m3[i] - df$PET_m3[i], 0),
                                0)
    
    df$Qout[i] <- df$PET_m3[i] + df$soil_m3[i] + df$pipe_m3[i] + df$overflow_m3[i]
    df$Qout[i] <- ifelse(df$S[i] - df$Qout[i] < 0, df$S[i], df$Qout[i])
  }
  return(df)
}

# Calculate the new values
TSA_model <- calculate_values(TSA_model)

# Display the updated TSA_model
str(TSA_model)
summary(TSA_model)

Solution

  • Without even looking at your calculations it looks like you have two issues with your code:

    1. You are not leveraging vectorization, which is normally faster because many vectorized functions in R are written in lower level programming languages with less overhead
    2. You are really hammering the copy-on-modify in your for loop, which is very inefficient

    If you need previous values for a calculation lag() exists and if you need the next value lead() exists.

    library(dplyr)
    
    TSA_model |>
      mutate(
        dS = Qin_m3 - lag(Qout, default = 0L),
        S = lag(S, default = 0L) + dS,
        depth = TSA_depth.fn(S, TSA_dimensions),
        
        soil_infiltration = soil_infiltration.fn(S, MRC_df = MRC_df, MRC_name = MRC_name, pipe_base_vol = pipe_base_vol),
        soil_m3 = max(soil_infiltration - PET_m3, 0),
        soil_m3 = ifelse(S - soil_m3 < 0, S, soil_m3),
        
        pipe_outflow = outlet_pipe.fn(pipe_diameter, waterDepth = depth, pipe_base = pipe_base_m, Cd = Cd, maxTSA_height = maxTSA_height),
        pipe_m3 = max(pipe_outflow, 0),
        
        overflow_m3 = ifelse(S > maxTSA_volume,
                             max(S - maxTSA_volume - pipe_m3 - soil_m3 - PET_m3, 0),
                             0),
        Qout = PET_m3 + soil_m3 + pipe_m3 + overflow_m3,
        Qout = ifelse(S - Qout < 0, S, Qout)
      )
    

    You should review these calculations closely to ensure they are calculating what you want/need. My focus is just improving performance.

    Benchmark

    Even with just 10 runs, you can see the performance difference is significant.

    f <- function() {
      TSA_model |>
        mutate(
          dS = Qin_m3 - lag(Qout, default = 0L),
          S = lag(S) + dS,
          depth = TSA_depth.fn(S, TSA_dimensions),
          
          soil_infiltration = soil_infiltration.fn(S, MRC_df = MRC_df, MRC_name = MRC_name, pipe_base_vol = pipe_base_vol),
          soil_m3 = max(soil_infiltration - PET_m3, 0),
          soil_m3 = ifelse(S - soil_m3 < 0, S, soil_m3),
          
          pipe_outflow = outlet_pipe.fn(pipe_diameter, waterDepth = depth, pipe_base = pipe_base_m, Cd = Cd, maxTSA_height = maxTSA_height),
          pipe_m3 = max(pipe_outflow, 0),
          
          overflow_m3 = ifelse(S > maxTSA_volume,
                               max(S - maxTSA_volume - pipe_m3 - soil_m3 - PET_m3, 0),
                               0),
          Qout = PET_m3 + soil_m3 + pipe_m3 + overflow_m3,
          Qout = ifelse(S - Qout < 0, S, Qout)
        )
    }
    
    library(microbenchmark)
    
    microbenchmark(
      calculate_values(TSA_model),
      f(), 
      times = 10L
    )
    
    # Unit: milliseconds
    #                         expr       min        lq       mean     median       uq       max neval cld
    #  calculate_values(TSA_model) 1976.5338 1991.7258 2029.16396 2010.32010 2065.407 2103.8642    10  a 
    #                          f()    2.6884    2.7874    3.02746    2.88345    3.081    3.8517    10   b
    

    1. Many of the functions you use are vectorized -- even many of your own functions are, but you are passing them one value at a time, which has a lot of overhead. Vectorization means the function will operation on all elements without you having to explicitly iterate. Often, but not always, functions in R are written at lower level languages that do iterate but can do it with much less overhead and are therefore faster.

    For example, consider multiplication, which is all you are doing in soil_infiltration.fn:

    1:5 * 0.1
    [1] 0.1 0.2 0.3 0.4 0.5
    
    # versus iteration (slower)
    x <- vector(mode = "double", length = 5L)
    for (i in seq_along(x)) x[[i]] <- i * 0.1
    
    # soil_infiltration.fn() is vectorized
    soil_infiltration.fn(1:5)
    # [1] 0.1 0.2 0.3 0.4 0.5
    
    1. copy-on-modify

    You can read more about this but in summary when you change the value of an object in memory a copy is created and that copy is modified. That copy is then binded to the name you assign and the old copy, unless binded to a different name needs to get cleaned up. If we trace your data frame through just one iteration of your loop you can see just how many times this is happening:

    calculate_values <- function(df) {
      n <- nrow(df)
      for (i in 2) { # JUST ONE ITERATION
        df$dS[i] <- df$Qin_m3[i] - df$Qout[i - 1]
        df$S[i] <- df$S[i - 1] + df$dS[i]
        df$depth[i] <- TSA_depth.fn(volume = df$S[i], TSA_dimensions = TSA_dimensions)
        
        soil_infiltration <- soil_infiltration.fn(volume = df$S[i], MRC_df = MRC_df, MRC_name = MRC_name, pipe_base_vol = pipe_base_vol)
        df$soil_m3[i] <- max(soil_infiltration - df$PET_m3[i], 0)
        df$soil_m3[i] <- ifelse(df$S[i] - df$soil_m3[i] < 0, df$S[i], df$soil_m3[i])
        
        pipe_outflow <- outlet_pipe.fn(diameter = pipe_diameter, waterDepth = df$depth[i], pipe_base = pipe_base_m, Cd = Cd, maxTSA_height = maxTSA_height)
        df$pipe_m3[i] <- max(pipe_outflow, 0)
        
        df$overflow_m3[i] <- ifelse(df$S[i] > maxTSA_volume,
                                    max(df$S[i] - maxTSA_volume - df$pipe_m3[i] - df$soil_m3[i] - df$PET_m3[i], 0),
                                    0)
        
        df$Qout[i] <- df$PET_m3[i] + df$soil_m3[i] + df$pipe_m3[i] + df$overflow_m3[i]
        df$Qout[i] <- ifelse(df$S[i] - df$Qout[i] < 0, df$S[i], df$Qout[i])
      }
      return(df)
    }
    
    tracemem(TSA_model)
    [1] "<000002789BBF3738>"
    
    output <- calculate_values(TSA_model)
    # tracemem[0x000002789bbf3738 -> 0x00000278ac6f2828]: calculate_values 
    # tracemem[0x00000278ac6f2828 -> 0x00000278ac6f28d8]: $<-.data.frame $<- calculate_values 
    # tracemem[0x00000278ac6f28d8 -> 0x00000278ac6f2988]: $<-.data.frame $<- calculate_values 
    # tracemem[0x00000278ac6f2988 -> 0x00000278ac6f2a38]: $<-.data.frame $<- calculate_values 
    # tracemem[0x00000278ac6f2a38 -> 0x00000278ac6f2ae8]: $<-.data.frame $<- calculate_values 
    # tracemem[0x00000278ac6f2ae8 -> 0x00000278ac6f2b98]: $<-.data.frame $<- calculate_values 
    # tracemem[0x00000278ac6f2b98 -> 0x00000278ac6f2c48]: $<-.data.frame $<- calculate_values 
    # tracemem[0x00000278ac6f2c48 -> 0x00000278ac6f2cf8]: $<-.data.frame $<- calculate_values 
    # tracemem[0x00000278ac6f2cf8 -> 0x00000278ac6f2da8]: $<-.data.frame $<- calculate_values 
    # tracemem[0x00000278ac6f2da8 -> 0x00000278ac6f2e58]: $<-.data.frame $<- calculate_values 
    
    untracemem(TSA_model)
    

    That is 10 times in a single iteration, 100,000 times in your example, and 2,000,000 times in your real data!