Function that calculates rolling MAD while ignoring outlier values in R

I am trying to write a function to compute the rolling Median Absolute Deviation (MAD) for outlier detection in a time series dataset. The goal is to:

Detect outliers based on a rolling MAD algorithm (calculate the median and MAD over a window_size of 3).
Exclude previously detected outliers from future MAD calculations.
Ensure the rolling window adapts dynamically by using only non-outlier values.

This is the function I have right now, but it does not work as intended.

detect_anomalies <- function(df, window_size, k) {
  df <- df %>% 
    arrange(Date_Label)

  cleaned_values <- c()  # Stores values that are NOT anomalies
  removed_rows <- list()  # Stores detected anomalies
  
  # Initialize new columns
  df$rolling_median <- NA
  df$abs_dev <- NA
  df$mad_value <- NA
  df$lower_bound <- NA
  df$upper_bound <- NA
  df$anomaly <- FALSE
  
  # Process each row iteratively
  for (i in 1:nrow(df)) {
    # Add the current value to cleaned_values first
    cleaned_values <- c(cleaned_values, df$Attempts[i])

    # Ensure we have enough past values
    if (length(cleaned_values) >= window_size) {
      # Extract past `window_size` values before adding the current row
      past_values <- head(cleaned_values, window_size)
      
      # Compute rolling median & MAD
      median_value <- median(past_values, na.rm = TRUE)
      abs_dev <- abs(df$Attempts[i] - median_value)
      mad_value <- median(tail(abs_dev, window_size), na.rm = TRUE)
      
      # Define anomaly thresholds
      lower_bound <- median_value - k * mad_value
      upper_bound <- median_value + k * mad_value
      
      # Check if the current row is an anomaly
      is_anomaly <- ifelse(df$Attempts[i] < lower_bound, "TRUE", "FALSE")

      # Store values in dataframe
      df$rolling_median[i] <- median_value
      df$abs_dev[i] <- abs_dev
      df$mad_value[i] <- mad_value
      df$lower_bound[i] <- lower_bound
      df$upper_bound[i] <- upper_bound
      df$anomaly[i] <- is_anomaly
    
    # Add current value to cleaned_values only if it's NOT an anomaly
      if (i <= window_size) {  
        cleaned_values <- c(cleaned_values, df$Attempts[i])  # Always add the first window_size values
        } else if (is_anomaly == "FALSE") {  
          cleaned_values <- c(cleaned_values, df$Attempts[i])  # Add only non-anomalous values
          } else {  
            removed_rows[[length(removed_rows) + 1]] <- df[i, ]  # Store anomaly separately
          }
    }
  }
  
  return(list(cleaned_df = df, anomalies = do.call(rbind, removed_rows)))
  }

# Run the anomaly detection function
result <- detect_anomalies(data, window_size = 3, k = 1)

# Cleaned dataset (excluding anomalies)
cleaned_data <- result$cleaned_df

# Anomalous rows
anomalies <- result$anomalies

The dataset looks like this:

Date_Label	Attempts
01-04-2024	186,518
01-05-2024	202,397
01-06-2024	252,707
01-07-2024	236,194
01-08-2024	217,135
01-09-2024	240,986
01-10-2024	205,524
01-11-2024	160,624
01-12-2024	142,238
01-01-2025	193,088

library(tibble)

data <- tibble::tibble(
  Date_Label = as.Date(c("2024-04-01", "2024-05-01", "2024-06-01", "2024-07-01", 
                         "2024-08-01", "2024-09-01", "2024-10-01", "2024-11-01", 
                         "2024-12-01", "2025-01-01")),
  Attempts = c(186518, 202397, 252707, 236194, 217135, 240986, 205524, 
               160624, 142238, 193088)
)

What I want is:

Date_Label	Attempts	rolling_median	abs_dev	mad_value	lower_bound	upper_bound	anomaly
01-04-2024	186,518	NA	NA	NA	NA	NA	NA
01-05-2024	202,397	NA	NA	NA	NA	NA	NA
01-06-2024	252,707	202,397	50310.00	50310	152,087	252,707	False
01-07-2024	236,194	236,194	0.00	25155	211,039	261,349	False
01-08-2024	217,135	236,194	19059.00	19059.00	217,135	255,253	False
01-09-2024	240,986	236,194	4792.00	4792.00	231,402	240,986	False
01-10-2024	205,524	217,135	11611.00	11611.00	205,524	228,746	False
01-11-2024	160,624	205,524	44900.00	11611.00	193,913	217,135	True
01-12-2024	142,238	205,524	63286.00	11611.00	193,913	217,135	True
01-01-2025	193,088	205,524	12436.00	11611.00	193,913	217,135	True

For instance, Attempts in Nov is flagged as an outlier (median (240,986, 205,524, 160,624)). In Dec, the Attempts have been flagged as an outlier based on Sept, Oct, and Dec values (median(240,986, 205,524, 142,238)), Nov Attempts are not included because it was flagged as an outlier, therefore dropped from the next calculation.

Can someone please help me with the code? Thank you.

Edit: You may disagree with the technique here, but we have determined that this works for our business scenario. I am only looking for help in coding this correctly in R. Thank you.

Solution

detect_anomalies <- function(df, k) {
  df <- df %>% 
    arrange(Date_Label)
  
  removed_rows <- data.frame()  # Empty dataframe for removed anomalies

  # Initialize new columns
  df$rolling_median <- NA
  df$abs_dev <- NA
  df$mad_value <- NA
  df$lower_bound <- NA
  df$upper_bound <- NA
  df$anomaly <- NA

i <- 1
while (i <= nrow(df)) {
    if(i < 3) {
    df$rolling_median[i] <- NA
    df$abs_dev[i] <- NA
    df$mad_value[i] <- NA
    df$lower_bound[i] <- NA
    df$upper_bound[i] <- NA
    df$anomaly[i] <- NA
    } else {
      df$rolling_median[i] <- median(df$Attempts[max(1, i-2):i], na.rm = TRUE)  # Rolling median for window 3
      df$abs_dev[i] <- abs(df$Attempts[i] - df$rolling_median[i])
      df$mad_value[i] <- median(df$abs_dev[max(1, i-2):i], na.rm = TRUE)  # Rolling MAD for window 3
      df$lower_bound[i] <- df$rolling_median[i] - (k * df$mad_value[i])
      df$upper_bound[i] <- df$rolling_median[i] + (k * df$mad_value[i])
      df$anomaly[i] <- ifelse(df$Attempts[i] < df$lower_bound[i], "TRUE", "FALSE")

      if (df$anomaly[i] == "TRUE") {  
        removed_rows <- rbind(removed_rows, df[i, ])
        df <- df[-i, ]
        next
        } 
  } 
  
  i <- i + 1  # Increment only if no row is removed
  
  }

df <- bind_rows(df, removed_rows)

}

# Run the anomaly detection function
result <- detect_anomalies(data, k = 1)