I originally created a for loop to calculate numerous variables that were sometimes dependent on previous iterations e.g. [i] and [i-1].
To improve the performance for larger datasets I attempted to use vectorization and produced the following code, which is quicker. The dataframe I hope to use this code on has a length of ~200,000 observations. Is there anyway I can improve the performance of this code further? I hope to run different scenarios in the future, which will also increase computing time.
# Load required packages
library(dplyr)
library(purrr)
# Set seed for reproducibility
set.seed(42)
# Generate random data for storm
n <- 10000 # number of rows
storm <- data.frame(
date_time = seq.POSIXt(from = as.POSIXct("2021-01-01"), by = "day", length.out = n),
temp = runif(n, min = -10, max = 35),
Rain = runif(n, min = 0, max = 50),
PET = runif(n, min = 0, max = 10),
Qin_m3 = runif(n, min = 0, max = 100)
)
# Placeholder functions
TSA_depth.fn <- function(volume, TSA_dimensions) {
# Example behavior: return volume divided by some constant
return(volume / TSA_dimensions)
}
soil_infiltration.fn <- function(volume, MRC_df, MRC_name, pipe_base_vol) {
# Example behavior: return volume times some fraction
return(volume * 0.1)
}
outlet_pipe.fn <- function(diameter, waterDepth, pipe_base, Cd, maxTSA_height) {
# Example behavior: return some function of the inputs
return(diameter * waterDepth * Cd)
}
# Constants
maxTSA_area <- 1000
TSA_dimensions <- 50
pipe_diameter <- 0.5
pipe_base_m <- 0.1
Cd <- 0.6
maxTSA_height <- 2.5
maxTSA_volume <- 2000
MRC_df <- data.frame() # Example placeholder
MRC_name <- "example"
pipe_base_vol <- 0.1
# Initial data preparation
TSA_model <- storm %>%
select(date_time, temp, Rain, PET, Qin_m3) %>%
mutate(Qin_m3 = replace(Qin_m3, 1, 0))
# Initialize new columns with 0 and precompute PET_m3
TSA_model <- TSA_model %>%
mutate(S = 0, dS = 0, depth = 0, PET_m3 = round((PET / 1000) * maxTSA_area, digits = 3),
soil_m3 = 0, pipe_m3 = 0, overflow_m3 = 0, Qout = 0)
# Function to calculate the values for the rows
calculate_values <- function(df) {
n <- nrow(df)
for (i in 2:n) {
df$dS[i] <- df$Qin_m3[i] - df$Qout[i - 1]
df$S[i] <- df$S[i - 1] + df$dS[i]
df$depth[i] <- TSA_depth.fn(volume = df$S[i], TSA_dimensions = TSA_dimensions)
soil_infiltration <- soil_infiltration.fn(volume = df$S[i], MRC_df = MRC_df, MRC_name = MRC_name, pipe_base_vol = pipe_base_vol)
df$soil_m3[i] <- max(soil_infiltration - df$PET_m3[i], 0)
df$soil_m3[i] <- ifelse(df$S[i] - df$soil_m3[i] < 0, df$S[i], df$soil_m3[i])
pipe_outflow <- outlet_pipe.fn(diameter = pipe_diameter, waterDepth = df$depth[i], pipe_base = pipe_base_m, Cd = Cd, maxTSA_height = maxTSA_height)
df$pipe_m3[i] <- max(pipe_outflow, 0)
df$overflow_m3[i] <- ifelse(df$S[i] > maxTSA_volume,
max(df$S[i] - maxTSA_volume - df$pipe_m3[i] - df$soil_m3[i] - df$PET_m3[i], 0),
0)
df$Qout[i] <- df$PET_m3[i] + df$soil_m3[i] + df$pipe_m3[i] + df$overflow_m3[i]
df$Qout[i] <- ifelse(df$S[i] - df$Qout[i] < 0, df$S[i], df$Qout[i])
}
return(df)
}
# Calculate the new values
TSA_model <- calculate_values(TSA_model)
# Display the updated TSA_model
str(TSA_model)
summary(TSA_model)
Without even looking at your calculations it looks like you have two issues with your code:
for
loop, which is very inefficientIf you need previous values for a calculation lag()
exists and if you need the next value lead()
exists.
library(dplyr)
TSA_model |>
mutate(
dS = Qin_m3 - lag(Qout, default = 0L),
S = lag(S, default = 0L) + dS,
depth = TSA_depth.fn(S, TSA_dimensions),
soil_infiltration = soil_infiltration.fn(S, MRC_df = MRC_df, MRC_name = MRC_name, pipe_base_vol = pipe_base_vol),
soil_m3 = max(soil_infiltration - PET_m3, 0),
soil_m3 = ifelse(S - soil_m3 < 0, S, soil_m3),
pipe_outflow = outlet_pipe.fn(pipe_diameter, waterDepth = depth, pipe_base = pipe_base_m, Cd = Cd, maxTSA_height = maxTSA_height),
pipe_m3 = max(pipe_outflow, 0),
overflow_m3 = ifelse(S > maxTSA_volume,
max(S - maxTSA_volume - pipe_m3 - soil_m3 - PET_m3, 0),
0),
Qout = PET_m3 + soil_m3 + pipe_m3 + overflow_m3,
Qout = ifelse(S - Qout < 0, S, Qout)
)
You should review these calculations closely to ensure they are calculating what you want/need. My focus is just improving performance.
Benchmark
Even with just 10 runs, you can see the performance difference is significant.
f <- function() {
TSA_model |>
mutate(
dS = Qin_m3 - lag(Qout, default = 0L),
S = lag(S) + dS,
depth = TSA_depth.fn(S, TSA_dimensions),
soil_infiltration = soil_infiltration.fn(S, MRC_df = MRC_df, MRC_name = MRC_name, pipe_base_vol = pipe_base_vol),
soil_m3 = max(soil_infiltration - PET_m3, 0),
soil_m3 = ifelse(S - soil_m3 < 0, S, soil_m3),
pipe_outflow = outlet_pipe.fn(pipe_diameter, waterDepth = depth, pipe_base = pipe_base_m, Cd = Cd, maxTSA_height = maxTSA_height),
pipe_m3 = max(pipe_outflow, 0),
overflow_m3 = ifelse(S > maxTSA_volume,
max(S - maxTSA_volume - pipe_m3 - soil_m3 - PET_m3, 0),
0),
Qout = PET_m3 + soil_m3 + pipe_m3 + overflow_m3,
Qout = ifelse(S - Qout < 0, S, Qout)
)
}
library(microbenchmark)
microbenchmark(
calculate_values(TSA_model),
f(),
times = 10L
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# calculate_values(TSA_model) 1976.5338 1991.7258 2029.16396 2010.32010 2065.407 2103.8642 10 a
# f() 2.6884 2.7874 3.02746 2.88345 3.081 3.8517 10 b
For example, consider multiplication, which is all you are doing in soil_infiltration.fn
:
1:5 * 0.1
[1] 0.1 0.2 0.3 0.4 0.5
# versus iteration (slower)
x <- vector(mode = "double", length = 5L)
for (i in seq_along(x)) x[[i]] <- i * 0.1
# soil_infiltration.fn() is vectorized
soil_infiltration.fn(1:5)
# [1] 0.1 0.2 0.3 0.4 0.5
You can read more about this but in summary when you change the value of an object in memory a copy is created and that copy is modified. That copy is then binded to the name you assign and the old copy, unless binded to a different name needs to get cleaned up. If we trace your data frame through just one iteration of your loop you can see just how many times this is happening:
calculate_values <- function(df) {
n <- nrow(df)
for (i in 2) { # JUST ONE ITERATION
df$dS[i] <- df$Qin_m3[i] - df$Qout[i - 1]
df$S[i] <- df$S[i - 1] + df$dS[i]
df$depth[i] <- TSA_depth.fn(volume = df$S[i], TSA_dimensions = TSA_dimensions)
soil_infiltration <- soil_infiltration.fn(volume = df$S[i], MRC_df = MRC_df, MRC_name = MRC_name, pipe_base_vol = pipe_base_vol)
df$soil_m3[i] <- max(soil_infiltration - df$PET_m3[i], 0)
df$soil_m3[i] <- ifelse(df$S[i] - df$soil_m3[i] < 0, df$S[i], df$soil_m3[i])
pipe_outflow <- outlet_pipe.fn(diameter = pipe_diameter, waterDepth = df$depth[i], pipe_base = pipe_base_m, Cd = Cd, maxTSA_height = maxTSA_height)
df$pipe_m3[i] <- max(pipe_outflow, 0)
df$overflow_m3[i] <- ifelse(df$S[i] > maxTSA_volume,
max(df$S[i] - maxTSA_volume - df$pipe_m3[i] - df$soil_m3[i] - df$PET_m3[i], 0),
0)
df$Qout[i] <- df$PET_m3[i] + df$soil_m3[i] + df$pipe_m3[i] + df$overflow_m3[i]
df$Qout[i] <- ifelse(df$S[i] - df$Qout[i] < 0, df$S[i], df$Qout[i])
}
return(df)
}
tracemem(TSA_model)
[1] "<000002789BBF3738>"
output <- calculate_values(TSA_model)
# tracemem[0x000002789bbf3738 -> 0x00000278ac6f2828]: calculate_values
# tracemem[0x00000278ac6f2828 -> 0x00000278ac6f28d8]: $<-.data.frame $<- calculate_values
# tracemem[0x00000278ac6f28d8 -> 0x00000278ac6f2988]: $<-.data.frame $<- calculate_values
# tracemem[0x00000278ac6f2988 -> 0x00000278ac6f2a38]: $<-.data.frame $<- calculate_values
# tracemem[0x00000278ac6f2a38 -> 0x00000278ac6f2ae8]: $<-.data.frame $<- calculate_values
# tracemem[0x00000278ac6f2ae8 -> 0x00000278ac6f2b98]: $<-.data.frame $<- calculate_values
# tracemem[0x00000278ac6f2b98 -> 0x00000278ac6f2c48]: $<-.data.frame $<- calculate_values
# tracemem[0x00000278ac6f2c48 -> 0x00000278ac6f2cf8]: $<-.data.frame $<- calculate_values
# tracemem[0x00000278ac6f2cf8 -> 0x00000278ac6f2da8]: $<-.data.frame $<- calculate_values
# tracemem[0x00000278ac6f2da8 -> 0x00000278ac6f2e58]: $<-.data.frame $<- calculate_values
untracemem(TSA_model)
That is 10 times in a single iteration, 100,000 times in your example, and 2,000,000 times in your real data!