I have time series data on different time axes in different dataframes. I need to interpolate data from one df onto the time axis of another df, df_ref. Example:
import polars as pl

# DataFrame with the reference time axis:
df_ref = pl.DataFrame(
    {
        "dt": [
            "2022-12-14T14:00:01.000",
            "2022-12-14T14:00:02.000",
            "2022-12-14T14:00:03.000",
            "2022-12-14T14:00:04.000",
            "2022-12-14T14:00:05.000",
            "2022-12-14T14:00:06.000",
        ]
    }
).with_columns(pl.col("dt").str.to_datetime())

# DataFrame with a different frequency time axis, to be interpolated
# onto the reference time axis:
df = pl.DataFrame(
    {
        "dt": ["2022-12-14T14:00:01.500", "2022-12-14T14:00:03.500", "2022-12-14T14:00:05.500"],
        "v1": [1.5, 3.5, 5.5],
    }
).with_columns(pl.col("dt").str.to_datetime())
I cannot simply join the dfs, since the keys don't match:
# A plain left join leaves every v1 null: none of df's timestamps exactly
# match df_ref's, so interpolate() has no anchor values to work with.
print(df_ref.join(df, on="dt", how="left").interpolate())
shape: (6, 2)
┌─────────────────────┬──────┐
│ dt ┆ v1 │
│ --- ┆ --- │
│ datetime[μs] ┆ f64 │
╞═════════════════════╪══════╡
│ 2022-12-14 14:00:01 ┆ null │
│ 2022-12-14 14:00:02 ┆ null │
│ 2022-12-14 14:00:03 ┆ null │
│ 2022-12-14 14:00:04 ┆ null │
│ 2022-12-14 14:00:05 ┆ null │
│ 2022-12-14 14:00:06 ┆ null │
└─────────────────────┴──────┘
So my 'iterative' approach would be to interpolate each column individually, for instance like
from scipy.interpolate import interp1d

# Build a linear interpolant over df's (timestamp, value) pairs, then
# evaluate it on df_ref's timestamps; points outside df's time range are
# linearly extrapolated from the edge segments.
interpolant = interp1d(
    df["dt"].dt.timestamp(),
    df["v1"],
    kind="linear",
    bounds_error=False,
    fill_value="extrapolate",
)
interpolated = interpolant(df_ref["dt"].dt.timestamp())
df_ref = df_ref.with_columns(pl.Series(interpolated).alias("v1_interp"))
print(df_ref.head(6))
shape: (6, 2)
┌─────────────────────┬───────────┐
│ dt ┆ v1_interp │
│ --- ┆ --- │
│ datetime[μs] ┆ f64 │
╞═════════════════════╪═══════════╡
│ 2022-12-14 14:00:01 ┆ 1.0 │
│ 2022-12-14 14:00:02 ┆ 2.0 │
│ 2022-12-14 14:00:03 ┆ 3.0 │
│ 2022-12-14 14:00:04 ┆ 4.0 │
│ 2022-12-14 14:00:05 ┆ 5.0 │
│ 2022-12-14 14:00:06 ┆ 6.0 │
└─────────────────────┴───────────┘
Although this gives the result I need, I wonder if there is a more idiomatic approach? I hesitate to mention efficiency here since I haven't benchmarked this with real data yet ("measure, don't guess!"). However, I'd assume that a native implementation in the underlying Rust code could add some performance benefits.
The scipy.interpolate.interp1d example ends up calling this function.
You could use the same approach and process each column with .map()
def polars_ip(df_ref, df):
    """Linearly interpolate every non-"dt" column of ``df`` onto the time
    axis of ``df_ref``.

    Parameters
    ----------
    df_ref : pl.DataFrame
        Frame providing the target "dt" (datetime) axis.
    df : pl.DataFrame
        Frame with its own "dt" axis plus the value columns to interpolate.

    Returns
    -------
    pl.DataFrame
        ``df_ref`` with one ``<name>_ref_ip`` column appended per value
        column of ``df``.  Timestamps outside ``df``'s range are linearly
        extrapolated from the nearest segment.
    """
    import numpy as np  # fix: np was used below but never imported

    old = df["dt"].dt.timestamp().to_numpy()
    new = df_ref["dt"].dt.timestamp().to_numpy()

    # For each target timestamp, find the bracketing pair (lo, hi) of
    # source timestamps.  Clipping hi into [1, len(old) - 1] makes
    # out-of-range targets reuse the first/last segment, which is exactly
    # linear extrapolation.
    hi = np.searchsorted(old, new).clip(1, len(old) - 1)
    lo = hi - 1

    def _interp(column):
        # Two-point linear interpolation on the bracketing segment.
        column = column.to_numpy()
        slope = (column[hi] - column[lo]) / (old[hi] - old[lo])
        return pl.Series(slope * (new - old[lo]) + column[lo])

    # The diagonal concat puts df's rows first, so _interp's row indices
    # (all < len(old)) only ever touch df's values; df_ref's rows merely
    # ensure every value column is present with the right dtype.
    values = (
        pl.concat([df, df_ref], how="diagonal")
        .select(pl.exclude("dt").map(_interp))
    )
    values.columns = [f"{name}_ref_ip" for name in values.columns]
    return df_ref.hstack(values)
>>> %time polars_ip(df_ref, df)
CPU times: user 48.1 ms, sys: 20.4 ms, total: 68.5 ms
Wall time: 22 ms
shape: (85536, 11)
>>> %time scipy_ip(df_ref, df)
CPU times: user 75.5 ms, sys: 5.51 ms, total: 81 ms
Wall time: 74.3 ms
shape: (85536, 11)
Check they return the same values:
>>> polars_ip(df_ref, df).frame_equal(scipy_ip(df_ref, df))
True
You can also generate the same values using:
N_COLS = 10
names = list(map(str, range(N_COLS)))
# Rows that came from df carry readings in every value column; rows that
# came from df_ref (added by the diagonal concat below) are all-null there,
# so the first value column distinguishes the two sources.
has_reading = pl.col(names[0]).is_not_null()
has_no_reading = has_reading.is_not()
(
# Interleave both frames on a single sorted time axis.
pl.concat([df, df_ref], how="diagonal")
.sort("dt")
# Attach to every row its next reading row ("_hi") and its previous
# reading row ("_lo"): the bracketing points for interpolation.
.with_columns([
pl.when(has_reading).then(pl.all())
.shift(-1).backward_fill().suffix("_hi"),
pl.when(has_reading).then(pl.all())
.shift(+1).forward_fill().suffix("_lo")
])
# Propagate the brackets across consecutive reference rows, and past the
# ends of the axis so out-of-range rows extrapolate from the edge segment.
.with_columns([
pl.when(has_reading).then(pl.col(r"^.+_hi$"))
.forward_fill().backward_fill(),
pl.when(has_reading).then(pl.col(r"^.+_lo$"))
.backward_fill().forward_fill()
])
# Keep only the reference rows -- the ones that need filling in.
.filter(has_no_reading)
# Numeric timestamps for dt, dt_hi, dt_lo to do arithmetic with.
.with_column(
pl.col(r"^dt.*$").dt.timestamp().suffix("_ts"))
# Two-point linear interpolation: lo + slope * (t - t_lo), per column.
.with_columns([
(((pl.col(f"{name}_hi") - pl.col(f"{name}_lo"))
/ (pl.col("dt_hi_ts") - pl.col("dt_lo_ts")))
* (pl.col("dt_ts") - pl.col("dt_lo_ts"))
+ pl.col(f"{name}_lo"))
.alias(f"{name}_ref_ip") for name in names
])
.select([
pl.col("dt"), pl.col("^.+_ref_ip$")
])
)