I have two groupby columns (patient, encounter), a timestamp column and a variable column (value) that i want to forward fill.
I only want to forward fill value in a row in the group, if the time difference between the timestamp column in the current row and the row containing the last entered value (not the last forward filled value) is less than a certain amount - say 60 minutes.
Here's an example before we do the forward fill:
and now what i want after the forward fill:
I have supplied the code below where i use a custom 'apply' method, but I have 123M rows of data and around 20 variable columns (C) to forward fill. Each column is taking too long to forward fill and I'm wondering if there is vector method I could use that would reduce the time.
Here is my group method:
def ffill_across_episodes(group, targetCol,maxMinsDiff):
curPat = 0
curEnc = 0
lastValidEntry = np.nan
hasValidEntry = False
validTimestamp = None
for index, row in group.iterrows():
#print(f"Target col timestamp = {row['Event_timestamp']}, value = {row[targetCol]}")
if curPat!=row['PATIENT_ID'] or curEnc != row['ENCNTR_ID']: # change in patient - reset everything
#print(f"change patient or encounter")
curPat = row['PATIENT_ID']
curEnc = row['ENCNTR_ID']
if np.isnan(row[targetCol]):
hasValidEntry=False
#print(f"set NON valid prior entry")
else:
#print(f"set valid prior entry")
hasValidEntry=True
lastValidEntry=row[targetCol]
validTimestamp = row['Event_timestamp']
else: # same encounter and patient
if np.isnan(row[targetCol]): # have a case for fillforward
if hasValidEntry:
#print(f"has valid prior entry. Timediff is: {( row['Event_timestamp']-validTimestamp).total_seconds()/60:.2f} mins")
if ( row['Event_timestamp']-validTimestamp).total_seconds()/60 < maxMinsDiff:
group.at[index, targetCol] = lastValidEntry
#print(f"set index={index} entry to {lastValidEntry}")
else: # is a valid entry, so reset the valid entries
#print(f"set valid prior entry")
hasValidEntry=True
lastValidEntry=row[targetCol]
validTimestamp = row['Event_timestamp']
return group
and here is my forward fill code:
for aVar in VITALS_COLS:
if aVar in df.columns:
print(f"Fwd filling {aVar} column by patient, encounter and across episode boundaries")
maxMinsDiff = MAX_HR_INTERVAL_BTW_VITALS*60
df_updated = df.groupby(GROUP_BY_ENCOUNTER_COLS).apply(ffill_across_episodes,aVar,maxMinsDiff)
else:
print(f"error: {aVar} is not a column within the dataframe")
df = df_updated
df_updated=None
df["last_valid_timestamp"] = df["timestamp"].where(~df["value"].isna()).ffill()
df["timedelta"] = df["timestamp"] - df["last_valid_timestamp"]
df.loc[df["timedelta"].dt.days <= 1, "value"] = df["value"].ffill()
df = df.drop(columns=["last_valid_timestamp", "timedelta"])
Original DataFrame:
timestamp value
0 2022-01-01 NaN
1 2022-01-02 cat
2 2022-01-03 NaN
3 2022-01-04 NaN
4 2022-01-05 dog
New DataFrame:
timestamp value
0 2022-01-01 NaN
1 2022-01-02 cat
2 2022-01-03 cat
3 2022-01-04 NaN
4 2022-01-05 dog