[SOLVED] Polars: Fill missing months per group

Polars: Fill missing months per group

I want to fill in missing months in a data frame per group based on the minimum and maximum date in each group.

This approach works but uses polars.map_elements.

import polars as pl
import numpy as np
from datetime import date

DATA_SIZE = 10000000
raw_df = pl.DataFrame({
    "id": np.random.choice(range(0, 10000), DATA_SIZE),
    "date": pl.date_range(date(1940, 1, 1), date(2020, 1, 1), interval="1mo", eager=True).sample(DATA_SIZE, with_replacement=True),
    "value": np.random.rand(DATA_SIZE)
})

clean_df = (raw_df
    .group_by("id")
    .agg(
        pl.struct(pl.col("date").min().alias("startDate"), pl.col("date").max().alias("endDate"))
        .map_elements(lambda row: pl.date_range(row["startDate"], row["endDate"], interval="1mo", eager=True)).alias("date")
    )
    .explode("date")
    .join(raw_df, how="left", on=["id", "date"])
    .with_columns(pl.col("value").fill_null(0))
)

Now that date_range accepts expressions there should be a more elegant and faster solution, but I can't get it to work: TypeError: cannot create expression literal for value of type builtin_function_or_method.

(raw_df
    .group_by("id")
    .agg(
        pl.col("date").min().alias("startDate"), pl.col("date").max().alias("endDate")
    )
    .select(
        id, pl.date_range(pl.col("startDate"), pl.col("endDate"), interval="1mo")
    )
)

How can I use the date_range with expressions?

Solution

Your answer is correct but you have a little syntax error, you missed the quotation marks around "id". The error message is a little bit cryptic since id is also a function.

(raw_df
    .group_by("id")
    .agg(
        pl.col("date").min().alias("startDate"), pl.col("date").max().alias("endDate")
    )
    .select(
        "id", pl.date_ranges(pl.col("startDate"), pl.col("endDate"), interval="1mo").alias("date")
    )
    .explode("date")
    .join(raw_df, how="left", on=["id", "date"])
    .with_columns(pl.col("value").fill_null(0))
)