Tags: python · pandas · numpy · vectorization

Is there a faster way to do time-interval comparison in pandas?


I am trying to implement a solution that finds consequential alarms for a given alarm within a chosen timeframe. I created an IntervalIndex from the DataFrame and used np.vectorize to compare timestamps against the interval index, but it is slow for large datasets (350,000 rows). Is there a way to make this faster? I am not sure how to truly vectorize the operation.

def find_top_consequential_alarms(
    ui: frontPage.Application, data: pd.DataFrame, fromAr: bool
):

    # Get selected alarm and time window based on the source
    selected_alarm = (
        ui.select_conc_alarm.currentText()
        if not fromAr
        else ui.ar_alarm_combo.currentText()
    )
    selected_time = (
        ui.select_conc_time_window.currentText()
        if not fromAr
        else ui.ar_timewindow_combo.currentText()
    )

    # Convert selected time to timedelta
    time_map = {
        "5 Minutes": timedelta(minutes=5),
        "10 Minutes": timedelta(minutes=10),
        "15 Minutes": timedelta(minutes=15),
        "30 Minutes": timedelta(minutes=30),
        "1 Hour": timedelta(hours=1),
        "5 Hours": timedelta(hours=5),
        "24 Hours": timedelta(hours=24),
    }
    time_window = time_map[selected_time]

    # Filter for occurrences of the selected alarm
    specific_alarms = data[data["PtName"].str.strip() == selected_alarm].copy()

    # Create time intervals for each specific alarm occurrence
    specific_alarms["window_start"] = specific_alarms["TimestampUTC"]
    specific_alarms["window_end"] = specific_alarms["TimestampUTC"] + time_window
    intervals = pd.IntervalIndex.from_arrays(
        specific_alarms["window_start"], specific_alarms["window_end"], closed="right"
    )

    # Filter for other alarms that fall within any of the intervals
    other_alarms = data[data["PtName"] != selected_alarm].copy()
    in_interval = np.vectorize(lambda x: intervals.contains(x).any(), otypes=[bool])
    mask = in_interval(other_alarms["TimestampUTC"])
    consequential_alarms = other_alarms[mask]

    # Count the occurrences of each alarm within the time windows and get the top 10
    consequential_alarm_counts = consequential_alarms["PtName"].value_counts().head(10)

    title = f"Top 10 Consequential Alarms for {selected_alarm}\n(Time Window: {time_window})"
    return consequential_alarm_counts, title

The overall function is above.

# Filter for other alarms that fall within any of the intervals
# NOTE(review): unlike the specific-alarm filter, this comparison does not
# .str.strip() PtName, so padded copies of the selected alarm end up in
# other_alarms — confirm whether that is intended.
other_alarms = data[data["PtName"] != selected_alarm].copy()
# np.vectorize is a per-row Python loop: intervals.contains(x) is evaluated
# once per timestamp, i.e. O(rows * intervals) work — this is the bottleneck.
in_interval = np.vectorize(lambda x: intervals.contains(x).any(), otypes=[bool])
mask = in_interval(other_alarms["TimestampUTC"])
consequential_alarms = other_alarms[mask]

This is the part that needs improvement.

Any help would be greatly appreciated.


Solution

  • np.vectorize works row-by-row, making it slow for large datasets. Instead use pd.merge_asof, which performs highly efficient, vectorized operations for time-based data alignment.

    import pandas as pd
    import numpy as np
    from datetime import timedelta
    import time

    # Fixed seed so the synthetic benchmark data (and the counts printed
    # below) are reproducible.
    np.random.seed(42)
    num_rows = 350000

    # One candidate timestamp per minute over ~9 days; 350k rows are drawn
    # from them with replacement, so repeated timestamps are expected.
    timestamps = pd.date_range(start="2024-11-01", end="2024-11-10", freq="1min")
    random_timestamps = np.random.choice(timestamps, num_rows)
    # 99 distinct alarm names, chosen uniformly at random per row.
    alarm_names = [f"Alarm{np.random.randint(1, 100)}" for _ in range(num_rows)]

    data = pd.DataFrame({"PtName": alarm_names, "TimestampUTC": random_timestamps})
    def find_top_consequential_alarms(ui, data, fromAr):
        """Count alarms that occur within `time_window` after the selected alarm.

        Vectorized with pd.merge_asof: each "other" alarm row is matched to
        the most recent preceding occurrence of the selected alarm, then kept
        only if that occurrence is at most `time_window` old.

        Returns (top-10 counts Series, plot title, merge execution time in s).
        """
        # Read inputs from whichever pair of combo boxes applies.
        selected_alarm = ui.select_conc_alarm.currentText() if not fromAr else ui.ar_alarm_combo.currentText()
        selected_time = ui.select_conc_time_window.currentText() if not fromAr else ui.ar_timewindow_combo.currentText()

        time_map = {
            "5 Minutes": timedelta(minutes=5),
            "10 Minutes": timedelta(minutes=10),
            "15 Minutes": timedelta(minutes=15),
            "30 Minutes": timedelta(minutes=30),
            "1 Hour": timedelta(hours=1),
            "5 Hours": timedelta(hours=5),
            "24 Hours": timedelta(hours=24),
        }
        time_window = time_map[selected_time]

        specific_alarms = data[data["PtName"].str.strip() == selected_alarm].copy()
        specific_alarms["window_start"] = specific_alarms["TimestampUTC"]
        specific_alarms["window_end"] = specific_alarms["TimestampUTC"] + time_window
        other_alarms = data[data["PtName"].str.strip() != selected_alarm].copy()
        # merge_asof requires both sides to be sorted on the join keys.
        specific_alarms = specific_alarms.sort_values(by="window_start")
        other_alarms = other_alarms.sort_values(by="TimestampUTC")

        start_time = time.perf_counter()  # monotonic clock for timing
        matched_alarms = pd.merge_asof(
            other_alarms,
            specific_alarms,
            left_on="TimestampUTC",
            right_on="window_start",
            direction="backward",
            tolerance=time_window,
            # Match the question's IntervalIndex(closed="right") semantics:
            # an alarm logged exactly AT the window start is NOT inside it.
            allow_exact_matches=False,
        )
        execution_time = time.perf_counter() - start_time

        # Rows with NaT window_start found no selected-alarm occurrence
        # within the window — drop them.
        consequential_alarms = matched_alarms.dropna(subset=["window_start"])
        # "PtName" exists in both frames; the left (other alarms) copy gets
        # the "_x" suffix after the merge.
        consequential_alarm_counts = consequential_alarms["PtName_x"].value_counts().head(10)
        title = f"Top 10 Consequential Alarms for {selected_alarm}\n(Time Window: {time_window})"
        return consequential_alarm_counts, title, execution_time
    
    # The real application passes a Qt `frontPage.Application`; for a
    # self-contained example we only need objects exposing `currentText()`,
    # otherwise this line raises NameError (`ui` was never defined).
    from types import SimpleNamespace
    ui = SimpleNamespace(
        select_conc_alarm=SimpleNamespace(currentText=lambda: "Alarm1"),
        select_conc_time_window=SimpleNamespace(currentText=lambda: "5 Minutes"),
    )
    result_counts, result_title, execution_time = find_top_consequential_alarms(ui, data, fromAr=False)

    print(result_counts)
    print(result_title)
    print(f"Execution Time: {execution_time} seconds")
    
    
    
    

    Which gives you

    PtName_x
    Alarm19    2954
    Alarm36    2951
    Alarm41    2951
    Alarm31    2939
    Alarm15    2933
    Alarm75    2932
    Alarm83    2932
    Alarm40    2932
    Alarm88    2927
    Alarm54    2927
    Name: count, dtype: int64
    Top 10 Consequential Alarms for Alarm1
    (Time Window: 0:05:00)
    Execution Time: 0.022200584411621094 seconds
    

    As you can see, the execution time is quite low, even on a DataFrame of 350,000 rows.