rmedianrolling-computation

Function that calculates rolling MAD while ignoring outlier values in R


I am trying to write a function to compute the rolling Median Absolute Deviation (MAD) for outlier detection in a time series dataset. The goal is to:

  1. Detect outliers based on a rolling MAD algorithm (calculate the median and MAD over a window_size of 3).
  2. Exclude previously detected outliers from future MAD calculations.
  3. Ensure the rolling window adapts dynamically by using only non-outlier values.

This is the function I have right now, but it does not work as intended.

detect_anomalies <- function(df, window_size, k) {
  df <- df %>% 
    arrange(Date_Label)

  cleaned_values <- c()  # Stores values that are NOT anomalies
  removed_rows <- list()  # Stores detected anomalies
  
  # Initialize new columns
  df$rolling_median <- NA
  df$abs_dev <- NA
  df$mad_value <- NA
  df$lower_bound <- NA
  df$upper_bound <- NA
  df$anomaly <- FALSE
  
  # Process each row iteratively
  for (i in 1:nrow(df)) {
    # Add the current value to cleaned_values first
    cleaned_values <- c(cleaned_values, df$Attempts[i])

    # Ensure we have enough past values
    if (length(cleaned_values) >= window_size) {
      # Extract past `window_size` values before adding the current row
      past_values <- head(cleaned_values, window_size)
      
      # Compute rolling median & MAD
      median_value <- median(past_values, na.rm = TRUE)
      abs_dev <- abs(df$Attempts[i] - median_value)
      mad_value <- median(tail(abs_dev, window_size), na.rm = TRUE)
      
      # Define anomaly thresholds
      lower_bound <- median_value - k * mad_value
      upper_bound <- median_value + k * mad_value
      
      # Check if the current row is an anomaly
      is_anomaly <- ifelse(df$Attempts[i] < lower_bound, "TRUE", "FALSE")

      # Store values in dataframe
      df$rolling_median[i] <- median_value
      df$abs_dev[i] <- abs_dev
      df$mad_value[i] <- mad_value
      df$lower_bound[i] <- lower_bound
      df$upper_bound[i] <- upper_bound
      df$anomaly[i] <- is_anomaly
    
    # Add current value to cleaned_values only if it's NOT an anomaly
      if (i <= window_size) {  
        cleaned_values <- c(cleaned_values, df$Attempts[i])  # Always add the first window_size values
        } else if (is_anomaly == "FALSE") {  
          cleaned_values <- c(cleaned_values, df$Attempts[i])  # Add only non-anomalous values
          } else {  
            removed_rows[[length(removed_rows) + 1]] <- df[i, ]  # Store anomaly separately
          }
    }
  }
  
  return(list(cleaned_df = df, anomalies = do.call(rbind, removed_rows)))
  }

# Run the anomaly detection function
result <- detect_anomalies(data, window_size = 3, k = 1)

# Cleaned dataset (excluding anomalies)
cleaned_data <- result$cleaned_df

# Anomalous rows
anomalies <- result$anomalies

The dataset looks like this:

Date_Label Attempts
01-04-2024 186,518
01-05-2024 202,397
01-06-2024 252,707
01-07-2024 236,194
01-08-2024 217,135
01-09-2024 240,986
01-10-2024 205,524
01-11-2024 160,624
01-12-2024 142,238
01-01-2025 193,088
library(tibble)

data <- tibble::tibble(
  Date_Label = as.Date(c("2024-04-01", "2024-05-01", "2024-06-01", "2024-07-01", 
                         "2024-08-01", "2024-09-01", "2024-10-01", "2024-11-01", 
                         "2024-12-01", "2025-01-01")),
  Attempts = c(186518, 202397, 252707, 236194, 217135, 240986, 205524, 
               160624, 142238, 193088)
)

What I want is:

Date_Label Attempts rolling_median abs_dev mad_value lower_bound upper_bound anomaly
01-04-2024 186,518 NA NA NA NA NA NA
01-05-2024 202,397 NA NA NA NA NA NA
01-06-2024 252,707 202,397 50310.00 50310 152,087 252,707 False
01-07-2024 236,194 236,194 0.00 25155 211,039 261,349 False
01-08-2024 217,135 236,194 19059.00 19059.00 217,135 255,253 False
01-09-2024 240,986 236,194 4792.00 4792.00 231,402 240,986 False
01-10-2024 205,524 217,135 11611.00 11611.00 205,524 228,746 False
01-11-2024 160,624 205,524 44900.00 11611.00 193,913 217,135 True
01-12-2024 142,238 205,524 63286.00 11611.00 193,913 217,135 True
01-01-2025 193,088 205,524 12436.00 11611.00 193,913 217,135 True

For instance, Attempts in Nov is flagged as an outlier (median (240,986, 205,524, 160,624)). In Dec, the Attempts have been flagged as an outlier based on Sept, Oct, and Dec values (median(240,986, 205,524, 142,238)), Nov Attempts are not included because it was flagged as an outlier, therefore dropped from the next calculation.

Can someone please help me with the code? Thank you.

Edit: You may disagree with the technique here, but we have determined that this works for our business scenario. I am only looking for help in coding this correctly in R. Thank you.


Solution

  • detect_anomalies <- function(df, k) {
      df <- df %>% 
        arrange(Date_Label)
      
      removed_rows <- data.frame()  # Empty dataframe for removed anomalies
    
      # Initialize new columns
      df$rolling_median <- NA
      df$abs_dev <- NA
      df$mad_value <- NA
      df$lower_bound <- NA
      df$upper_bound <- NA
      df$anomaly <- NA
    
    i <- 1
    while (i <= nrow(df)) {
        if(i < 3) {
        df$rolling_median[i] <- NA
        df$abs_dev[i] <- NA
        df$mad_value[i] <- NA
        df$lower_bound[i] <- NA
        df$upper_bound[i] <- NA
        df$anomaly[i] <- NA
        } else {
          df$rolling_median[i] <- median(df$Attempts[max(1, i-2):i], na.rm = TRUE)  # Rolling median for window 3
          df$abs_dev[i] <- abs(df$Attempts[i] - df$rolling_median[i])
          df$mad_value[i] <- median(df$abs_dev[max(1, i-2):i], na.rm = TRUE)  # Rolling MAD for window 3
          df$lower_bound[i] <- df$rolling_median[i] - (k * df$mad_value[i])
          df$upper_bound[i] <- df$rolling_median[i] + (k * df$mad_value[i])
          df$anomaly[i] <- ifelse(df$Attempts[i] < df$lower_bound[i], "TRUE", "FALSE")
    
          if (df$anomaly[i] == "TRUE") {  
            removed_rows <- rbind(removed_rows, df[i, ])
            df <- df[-i, ]
            next
            } 
      } 
      
      i <- i + 1  # Increment only if no row is removed
      
      }
    
    df <- bind_rows(df, removed_rows)
    
    }
    
    # Run the anomaly detection function
    result <- detect_anomalies(data, k = 1)