I am trying to write a function to compute the rolling Median Absolute Deviation (MAD) for outlier detection in a time series dataset. The goal is to:
This is the function I have right now, but it does not work as intended.
detect_anomalies <- function(df, window_size, k) {
df <- df %>%
arrange(Date_Label)
cleaned_values <- c() # Stores values that are NOT anomalies
removed_rows <- list() # Stores detected anomalies
# Initialize new columns
df$rolling_median <- NA
df$abs_dev <- NA
df$mad_value <- NA
df$lower_bound <- NA
df$upper_bound <- NA
df$anomaly <- FALSE
# Process each row iteratively
for (i in 1:nrow(df)) {
# Add the current value to cleaned_values first
cleaned_values <- c(cleaned_values, df$Attempts[i])
# Ensure we have enough past values
if (length(cleaned_values) >= window_size) {
# Extract past `window_size` values before adding the current row
past_values <- head(cleaned_values, window_size)
# Compute rolling median & MAD
median_value <- median(past_values, na.rm = TRUE)
abs_dev <- abs(df$Attempts[i] - median_value)
mad_value <- median(tail(abs_dev, window_size), na.rm = TRUE)
# Define anomaly thresholds
lower_bound <- median_value - k * mad_value
upper_bound <- median_value + k * mad_value
# Check if the current row is an anomaly
is_anomaly <- ifelse(df$Attempts[i] < lower_bound, "TRUE", "FALSE")
# Store values in dataframe
df$rolling_median[i] <- median_value
df$abs_dev[i] <- abs_dev
df$mad_value[i] <- mad_value
df$lower_bound[i] <- lower_bound
df$upper_bound[i] <- upper_bound
df$anomaly[i] <- is_anomaly
# Add current value to cleaned_values only if it's NOT an anomaly
if (i <= window_size) {
cleaned_values <- c(cleaned_values, df$Attempts[i]) # Always add the first window_size values
} else if (is_anomaly == "FALSE") {
cleaned_values <- c(cleaned_values, df$Attempts[i]) # Add only non-anomalous values
} else {
removed_rows[[length(removed_rows) + 1]] <- df[i, ] # Store anomaly separately
}
}
}
return(list(cleaned_df = df, anomalies = do.call(rbind, removed_rows)))
}
# Run the anomaly detection function
result <- detect_anomalies(data, window_size = 3, k = 1)
# Cleaned dataset (excluding anomalies)
cleaned_data <- result$cleaned_df
# Anomalous rows
anomalies <- result$anomalies
The dataset looks like this:
Date_Label | Attempts |
---|---|
01-04-2024 | 186,518 |
01-05-2024 | 202,397 |
01-06-2024 | 252,707 |
01-07-2024 | 236,194 |
01-08-2024 | 217,135 |
01-09-2024 | 240,986 |
01-10-2024 | 205,524 |
01-11-2024 | 160,624 |
01-12-2024 | 142,238 |
01-01-2025 | 193,088 |
library(tibble)
data <- tibble::tibble(
Date_Label = as.Date(c("2024-04-01", "2024-05-01", "2024-06-01", "2024-07-01",
"2024-08-01", "2024-09-01", "2024-10-01", "2024-11-01",
"2024-12-01", "2025-01-01")),
Attempts = c(186518, 202397, 252707, 236194, 217135, 240986, 205524,
160624, 142238, 193088)
)
What I want is:
Date_Label | Attempts | rolling_median | abs_dev | mad_value | lower_bound | upper_bound | anomaly |
---|---|---|---|---|---|---|---|
01-04-2024 | 186,518 | NA | NA | NA | NA | NA | NA |
01-05-2024 | 202,397 | NA | NA | NA | NA | NA | NA |
01-06-2024 | 252,707 | 202,397 | 50310.00 | 50310 | 152,087 | 252,707 | False |
01-07-2024 | 236,194 | 236,194 | 0.00 | 25155 | 211,039 | 261,349 | False |
01-08-2024 | 217,135 | 236,194 | 19059.00 | 19059.00 | 217,135 | 255,253 | False |
01-09-2024 | 240,986 | 236,194 | 4792.00 | 4792.00 | 231,402 | 240,986 | False |
01-10-2024 | 205,524 | 217,135 | 11611.00 | 11611.00 | 205,524 | 228,746 | False |
01-11-2024 | 160,624 | 205,524 | 44900.00 | 11611.00 | 193,913 | 217,135 | True |
01-12-2024 | 142,238 | 205,524 | 63286.00 | 11611.00 | 193,913 | 217,135 | True |
01-01-2025 | 193,088 | 205,524 | 12436.00 | 11611.00 | 193,913 | 217,135 | True |
For instance, Attempts in Nov is flagged as an outlier (median (240,986, 205,524, 160,624)). In Dec, the Attempts have been flagged as an outlier based on Sept, Oct, and Dec values (median(240,986, 205,524, 142,238)), Nov Attempts are not included because it was flagged as an outlier, therefore dropped from the next calculation.
Can someone please help me with the code? Thank you.
Edit: You may disagree with the technique here, but we have determined that this works for our business scenario. I am only looking for help in coding this correctly in R. Thank you.
detect_anomalies <- function(df, k) {
df <- df %>%
arrange(Date_Label)
removed_rows <- data.frame() # Empty dataframe for removed anomalies
# Initialize new columns
df$rolling_median <- NA
df$abs_dev <- NA
df$mad_value <- NA
df$lower_bound <- NA
df$upper_bound <- NA
df$anomaly <- NA
i <- 1
while (i <= nrow(df)) {
if(i < 3) {
df$rolling_median[i] <- NA
df$abs_dev[i] <- NA
df$mad_value[i] <- NA
df$lower_bound[i] <- NA
df$upper_bound[i] <- NA
df$anomaly[i] <- NA
} else {
df$rolling_median[i] <- median(df$Attempts[max(1, i-2):i], na.rm = TRUE) # Rolling median for window 3
df$abs_dev[i] <- abs(df$Attempts[i] - df$rolling_median[i])
df$mad_value[i] <- median(df$abs_dev[max(1, i-2):i], na.rm = TRUE) # Rolling MAD for window 3
df$lower_bound[i] <- df$rolling_median[i] - (k * df$mad_value[i])
df$upper_bound[i] <- df$rolling_median[i] + (k * df$mad_value[i])
df$anomaly[i] <- ifelse(df$Attempts[i] < df$lower_bound[i], "TRUE", "FALSE")
if (df$anomaly[i] == "TRUE") {
removed_rows <- rbind(removed_rows, df[i, ])
df <- df[-i, ]
next
}
}
i <- i + 1 # Increment only if no row is removed
}
df <- bind_rows(df, removed_rows)
}
# Run the anomaly detection function
result <- detect_anomalies(data, k = 1)