Tags: python · pandas · numpy · vectorization

Is there a faster way to do time-interval comparison in pandas?


I am trying to implement a solution that finds consequential alarms for a given alarm within a chosen timeframe. I created an IntervalIndex from the DataFrame and used np.vectorize to compare timestamps against the interval index, but it is slow for large datasets (350,000 rows). Is there a way to make this faster? I am not sure how to truly vectorize the operation.

def find_top_consequential_alarms(
    ui: frontPage.Application, data: pd.DataFrame, fromAr: bool
):

    # Get selected alarm and time window based on the source
    selected_alarm = (
        ui.select_conc_alarm.currentText()
        if not fromAr
        else ui.ar_alarm_combo.currentText()
    )
    selected_time = (
        ui.select_conc_time_window.currentText()
        if not fromAr
        else ui.ar_timewindow_combo.currentText()
    )

    # Convert selected time to timedelta
    time_map = {
        "5 Minutes": timedelta(minutes=5),
        "10 Minutes": timedelta(minutes=10),
        "15 Minutes": timedelta(minutes=15),
        "30 Minutes": timedelta(minutes=30),
        "1 Hour": timedelta(hours=1),
        "5 Hours": timedelta(hours=5),
        "24 Hours": timedelta(hours=24),
    }
    time_window = time_map[selected_time]

    # Filter for occurrences of the selected alarm
    specific_alarms = data[data["PtName"].str.strip() == selected_alarm].copy()

    # Create time intervals for each specific alarm occurrence
    specific_alarms["window_start"] = specific_alarms["TimestampUTC"]
    specific_alarms["window_end"] = specific_alarms["TimestampUTC"] + time_window
    intervals = pd.IntervalIndex.from_arrays(
        specific_alarms["window_start"], specific_alarms["window_end"], closed="right"
    )

    # Filter for other alarms that fall within any of the intervals
    other_alarms = data[data["PtName"] != selected_alarm].copy()
    in_interval = np.vectorize(lambda x: intervals.contains(x).any(), otypes=[bool])
    mask = in_interval(other_alarms["TimestampUTC"])
    consequential_alarms = other_alarms[mask]

    # Count the occurrences of each alarm within the time windows and get the top 10
    consequential_alarm_counts = consequential_alarms["PtName"].value_counts().head(10)

    title = f"Top 10 Consequential Alarms for {selected_alarm}\n(Time Window: {time_window})"
    return consequential_alarm_counts, title

The overall function is above.

# Filter for other alarms that fall within any of the intervals
# NOTE(review): unlike the specific-alarm filter, this comparison does not
# .str.strip() PtName, so padded copies of the selected alarm end up in
# other_alarms — confirm whether that is intended.
other_alarms = data[data["PtName"] != selected_alarm].copy()
# np.vectorize is a per-row Python loop: intervals.contains(x) is evaluated
# once per timestamp, i.e. O(rows * intervals) work — this is the bottleneck.
in_interval = np.vectorize(lambda x: intervals.contains(x).any(), otypes=[bool])
mask = in_interval(other_alarms["TimestampUTC"])
consequential_alarms = other_alarms[mask]

This is the part that needs improvement.

Any help would be greatly appreciated.


Solution

  • np.vectorize works row-by-row, making it slow for large datasets. Instead use pd.merge_asof, which performs highly efficient, vectorized operations for time-based data alignment.

    import pandas as pd
    import numpy as np
    from datetime import timedelta
    import time

    # Fixed seed so the synthetic benchmark data (and the counts printed
    # below) are reproducible.
    np.random.seed(42)
    num_rows = 350000

    # One candidate timestamp per minute over ~9 days; 350k rows are drawn
    # from them with replacement, so repeated timestamps are expected.
    timestamps = pd.date_range(start="2024-11-01", end="2024-11-10", freq="1min")
    random_timestamps = np.random.choice(timestamps, num_rows)
    # 99 distinct alarm names, chosen uniformly at random per row.
    alarm_names = [f"Alarm{np.random.randint(1, 100)}" for _ in range(num_rows)]

    data = pd.DataFrame({"PtName": alarm_names, "TimestampUTC": random_timestamps})
    def find_top_consequential_alarms(ui, data, fromAr):
        """Count alarms that occur within `time_window` after the selected alarm.

        Vectorized with pd.merge_asof: each "other" alarm row is matched to
        the most recent preceding occurrence of the selected alarm, then kept
        only if that occurrence is at most `time_window` old.

        Returns (top-10 counts Series, plot title, merge execution time in s).
        """
        # Read inputs from whichever pair of combo boxes applies.
        selected_alarm = ui.select_conc_alarm.currentText() if not fromAr else ui.ar_alarm_combo.currentText()
        selected_time = ui.select_conc_time_window.currentText() if not fromAr else ui.ar_timewindow_combo.currentText()

        time_map = {
            "5 Minutes": timedelta(minutes=5),
            "10 Minutes": timedelta(minutes=10),
            "15 Minutes": timedelta(minutes=15),
            "30 Minutes": timedelta(minutes=30),
            "1 Hour": timedelta(hours=1),
            "5 Hours": timedelta(hours=5),
            "24 Hours": timedelta(hours=24),
        }
        time_window = time_map[selected_time]

        specific_alarms = data[data["PtName"].str.strip() == selected_alarm].copy()
        specific_alarms["window_start"] = specific_alarms["TimestampUTC"]
        specific_alarms["window_end"] = specific_alarms["TimestampUTC"] + time_window
        other_alarms = data[data["PtName"].str.strip() != selected_alarm].copy()
        # merge_asof requires both sides to be sorted on the join keys.
        specific_alarms = specific_alarms.sort_values(by="window_start")
        other_alarms = other_alarms.sort_values(by="TimestampUTC")

        start_time = time.perf_counter()  # monotonic clock for timing
        matched_alarms = pd.merge_asof(
            other_alarms,
            specific_alarms,
            left_on="TimestampUTC",
            right_on="window_start",
            direction="backward",
            tolerance=time_window,
            # Match the question's IntervalIndex(closed="right") semantics:
            # an alarm logged exactly AT the window start is NOT inside it.
            allow_exact_matches=False,
        )
        execution_time = time.perf_counter() - start_time

        # Rows with NaT window_start found no selected-alarm occurrence
        # within the window — drop them.
        consequential_alarms = matched_alarms.dropna(subset=["window_start"])
        # "PtName" exists in both frames; the left (other alarms) copy gets
        # the "_x" suffix after the merge.
        consequential_alarm_counts = consequential_alarms["PtName_x"].value_counts().head(10)
        title = f"Top 10 Consequential Alarms for {selected_alarm}\n(Time Window: {time_window})"
        return consequential_alarm_counts, title, execution_time
    
    # The real application passes a Qt `frontPage.Application`; for a
    # self-contained example we only need objects exposing `currentText()`,
    # otherwise this line raises NameError (`ui` was never defined).
    from types import SimpleNamespace
    ui = SimpleNamespace(
        select_conc_alarm=SimpleNamespace(currentText=lambda: "Alarm1"),
        select_conc_time_window=SimpleNamespace(currentText=lambda: "5 Minutes"),
    )
    result_counts, result_title, execution_time = find_top_consequential_alarms(ui, data, fromAr=False)

    print(result_counts)
    print(result_title)
    print(f"Execution Time: {execution_time} seconds")
    
    
    
    

    Which gives you

    PtName_x
    Alarm19    2954
    Alarm36    2951
    Alarm41    2951
    Alarm31    2939
    Alarm15    2933
    Alarm75    2932
    Alarm83    2932
    Alarm40    2932
    Alarm88    2927
    Alarm54    2927
    Name: count, dtype: int64
    Top 10 Consequential Alarms for Alarm1
    (Time Window: 0:05:00)
    Execution Time: 0.022200584411621094 seconds
    

    As you can see, the execution time is quite low, even on a DataFrame of 350,000 rows.