I need to compute all month ends between two date columns and explode the resulting lists.
import polars as pl
from datetime import datetime
df = pl.DataFrame(
{
"id": ["A", "A", "A", "B", "B"],
"value": ["1", "2", "3", "4", "5"],
"valid_from": [
datetime(2020, 1, 1),
datetime(2021, 1, 1),
datetime(2022, 1, 1),
datetime(2020, 1, 1),
datetime(2021, 1, 1),
],
"valid_to": [
datetime(2020, 12, 31),
datetime(2021, 12, 31),
datetime(2022, 12, 31),
datetime(2020, 12, 31),
datetime(2021, 12, 31),
],
}
)
def __month_range(dict):
start,end = dict.values()
return pl.date_range(start, end, "1mo", eager=True).dt.month_end()
df.with_columns(
pl.struct("valid_from","valid_to").map_elements(__month_range).alias("test")
).explode("test")
is this the way to do it? Or is there a simpler / faster approach without using struct ?
[Update]: pl.date_ranges()
(plural) was added in 0.18.9
which does this directly.
>>> pl.date_ranges("valid_from", "valid_to")
<polars.expr.expr.Expr at 0x135015b70>
df.with_columns(date = pl.date_ranges("valid_from", "valid_to"))
# shape: (5, 5)
# ┌─────┬───────┬─────────────────────┬─────────────────────┬───────────────────────────────────┐
# │ id ┆ value ┆ valid_from ┆ valid_to ┆ date │
# │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
# │ str ┆ str ┆ datetime[μs] ┆ datetime[μs] ┆ list[datetime[μs]] │
# ╞═════╪═══════╪═════════════════════╪═════════════════════╪═══════════════════════════════════╡
# │ A ┆ 1 ┆ 2020-01-01 00:00:00 ┆ 2020-12-31 00:00:00 ┆ [2020-01-01 00:00:00, 2020-01-02… │
# │ A ┆ 2 ┆ 2021-01-01 00:00:00 ┆ 2021-12-31 00:00:00 ┆ [2021-01-01 00:00:00, 2021-01-02… │
# │ A ┆ 3 ┆ 2022-01-01 00:00:00 ┆ 2022-12-31 00:00:00 ┆ [2022-01-01 00:00:00, 2022-01-02… │
# │ B ┆ 4 ┆ 2020-01-01 00:00:00 ┆ 2020-12-31 00:00:00 ┆ [2020-01-01 00:00:00, 2020-01-02… │
# │ B ┆ 5 ┆ 2021-01-01 00:00:00 ┆ 2021-12-31 00:00:00 ┆ [2021-01-01 00:00:00, 2021-01-02… │
# └─────┴───────┴─────────────────────┴─────────────────────┴───────────────────────────────────┘
(df.with_columns(date = pl.date_ranges("valid_from", "valid_to"))
.explode("date")
.with_columns(month_end = pl.col("date").dt.month_end())
)
# shape: (1_827, 6)
# ┌─────┬───────┬─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────┐
# │ id ┆ value ┆ valid_from ┆ valid_to ┆ date ┆ month_end │
# │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
# │ str ┆ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │
# ╞═════╪═══════╪═════════════════════╪═════════════════════╪═════════════════════╪═════════════════════╡
# │ A ┆ 1 ┆ 2020-01-01 00:00:00 ┆ 2020-12-31 00:00:00 ┆ 2020-01-01 00:00:00 ┆ 2020-01-31 00:00:00 │
# │ A ┆ 1 ┆ 2020-01-01 00:00:00 ┆ 2020-12-31 00:00:00 ┆ 2020-01-02 00:00:00 ┆ 2020-01-31 00:00:00 │
# │ A ┆ 1 ┆ 2020-01-01 00:00:00 ┆ 2020-12-31 00:00:00 ┆ 2020-01-03 00:00:00 ┆ 2020-01-31 00:00:00 │
# │ A ┆ 1 ┆ 2020-01-01 00:00:00 ┆ 2020-12-31 00:00:00 ┆ 2020-01-04 00:00:00 ┆ 2020-01-31 00:00:00 │
# │ … ┆ … ┆ … ┆ … ┆ … ┆ … │
# │ B ┆ 5 ┆ 2021-01-01 00:00:00 ┆ 2021-12-31 00:00:00 ┆ 2021-12-28 00:00:00 ┆ 2021-12-31 00:00:00 │
# │ B ┆ 5 ┆ 2021-01-01 00:00:00 ┆ 2021-12-31 00:00:00 ┆ 2021-12-29 00:00:00 ┆ 2021-12-31 00:00:00 │
# │ B ┆ 5 ┆ 2021-01-01 00:00:00 ┆ 2021-12-31 00:00:00 ┆ 2021-12-30 00:00:00 ┆ 2021-12-31 00:00:00 │
# │ B ┆ 5 ┆ 2021-01-01 00:00:00 ┆ 2021-12-31 00:00:00 ┆ 2021-12-31 00:00:00 ┆ 2021-12-31 00:00:00 │
# └─────┴───────┴─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────┘