i am trying to implement a solution that finds consequential alarms for an alarm within the given timeframe. i have created an intervalIndex from the dataframe and used np.vectorize to compare timestamps to the interval index but it is slow for large datasets (350000 rows). is there a way to make this faster, i am not sure how to truly vectorize the operation.
def find_top_consequential_alarms(
ui: frontPage.Application, data: pd.DataFrame, fromAr: bool
):
# Get selected alarm and time window based on the source
selected_alarm = (
ui.select_conc_alarm.currentText()
if not fromAr
else ui.ar_alarm_combo.currentText()
)
selected_time = (
ui.select_conc_time_window.currentText()
if not fromAr
else ui.ar_timewindow_combo.currentText()
)
# Convert selected time to timedelta
time_map = {
"5 Minutes": timedelta(minutes=5),
"10 Minutes": timedelta(minutes=10),
"15 Minutes": timedelta(minutes=15),
"30 Minutes": timedelta(minutes=30),
"1 Hour": timedelta(hours=1),
"5 Hours": timedelta(hours=5),
"24 Hours": timedelta(hours=24),
}
time_window = time_map[selected_time]
# Filter for occurrences of the selected alarm
specific_alarms = data[data["PtName"].str.strip() == selected_alarm].copy()
# Create time intervals for each specific alarm occurrence
specific_alarms["window_start"] = specific_alarms["TimestampUTC"]
specific_alarms["window_end"] = specific_alarms["TimestampUTC"] + time_window
intervals = pd.IntervalIndex.from_arrays(
specific_alarms["window_start"], specific_alarms["window_end"], closed="right"
)
# Filter for other alarms that fall within any of the intervals
other_alarms = data[data["PtName"] != selected_alarm].copy()
in_interval = np.vectorize(lambda x: intervals.contains(x).any(), otypes=[bool])
mask = in_interval(other_alarms["TimestampUTC"])
consequential_alarms = other_alarms[mask]
# Count the occurrences of each alarm within the time windows and get the top 10
consequential_alarm_counts = consequential_alarms["PtName"].value_counts().head(10)
title = f"Top 10 Consequential Alarms for {selected_alarm}\n(Time Window: {time_window})"
return consequential_alarm_counts, title
the overall function is above
# Filter for other alarms that fall within any of the intervals
other_alarms = data[data["PtName"] != selected_alarm].copy()
in_interval = np.vectorize(lambda x: intervals.contains(x).any(), otypes=[bool])
mask = in_interval(other_alarms["TimestampUTC"])
consequential_alarms = other_alarms[mask]
this is the part where that needs improvements.
any help would be greatly appreciated.
np.vectorize
works row-by-row, making it slow for large datasets. Instead use pd.merge_asof
, which performs highly efficient, vectorized operations for time-based data alignment.
import pandas as pd
import numpy as np
from datetime import timedelta
import time
np.random.seed(42)
num_rows = 350000
timestamps = pd.date_range(start="2024-11-01", end="2024-11-10", freq="1min")
random_timestamps = np.random.choice(timestamps, num_rows)
alarm_names = [f"Alarm{np.random.randint(1, 100)}" for _ in range(num_rows)]
data = pd.DataFrame({"PtName": alarm_names, "TimestampUTC": random_timestamps})
def find_top_consequential_alarms(ui, data, fromAr):
selected_alarm = ui.select_conc_alarm.currentText() if not fromAr else ui.ar_alarm_combo.currentText()
selected_time = ui.select_conc_time_window.currentText() if not fromAr else ui.ar_timewindow_combo.currentText()
time_map = {
"5 Minutes": timedelta(minutes=5),
"10 Minutes": timedelta(minutes=10),
"15 Minutes": timedelta(minutes=15),
"30 Minutes": timedelta(minutes=30),
"1 Hour": timedelta(hours=1),
"5 Hours": timedelta(hours=5),
"24 Hours": timedelta(hours=24),
}
time_window = time_map[selected_time]
specific_alarms = data[data["PtName"].str.strip() == selected_alarm].copy()
specific_alarms["window_start"] = specific_alarms["TimestampUTC"]
specific_alarms["window_end"] = specific_alarms["TimestampUTC"] + time_window
other_alarms = data[data["PtName"].str.strip() != selected_alarm].copy()
specific_alarms = specific_alarms.sort_values(by="window_start")
other_alarms = other_alarms.sort_values(by="TimestampUTC")
start_time = time.time()
matched_alarms = pd.merge_asof(
other_alarms,
specific_alarms,
left_on="TimestampUTC",
right_on="window_start",
direction="backward",
tolerance=time_window
)
execution_time = time.time() - start_time
consequential_alarms = matched_alarms.dropna(subset=["window_start"])
consequential_alarm_counts = consequential_alarms["PtName_x"].value_counts().head(10)
title = f"Top 10 Consequential Alarms for {selected_alarm}\n(Time Window: {time_window})"
return consequential_alarm_counts, title, execution_time
result_counts, result_title, execution_time = find_top_consequential_alarms(ui, data, fromAr=False)
print(result_counts)
print(result_title)
print(f"Execution Time: {execution_time} seconds")
Which gives you
PtName_x
Alarm19 2954
Alarm36 2951
Alarm41 2951
Alarm31 2939
Alarm15 2933
Alarm75 2932
Alarm83 2932
Alarm40 2932
Alarm88 2927
Alarm54 2927
Name: count, dtype: int64
Top 10 Consequential Alarms for Alarm1
(Time Window: 0:05:00)
Execution Time: 0.022200584411621094 seconds
As you can see, the execution is quite low on a dataframe of 350 000 rows.