pivotpython-polarsmeltlazyframe

Efficient method for adding derived data to a Polars multiIndex LazyFrame


I am working with Polars and need to add derived data to a multiindex LazyFrame. To modify the data, I am pivoting the DataFrame without performing any aggregations, doing some computations, and then melting the DataFrame back to its original format. This operation needs to be done on different index levels. The Polars documentation mentions that pivot operations are not available in lazy mode. Due to the pivoting/unpivoting, it is necessary to switch back and forth between eager and lazy modes. Is there a more efficient way to achieve this without switching between eager and lazy modes?

Here is an example:

import numpy as np
import polars as pl

def get_raw_data() -> pl.LazyFrame:
    """Generate random a multiindex LazyFrame with different size of indexes as example."""
    names = np.array(['A', 'B', 'C'])
    measures = np.array(['height', 'width'])
    repeats: np.ndarray = np.array([3, 3, 2])
    return pl.DataFrame({
        'id': np.repeat(names, repeats*measures.size),
        'measure': np.concatenate([np.repeat(measures, rep) for rep in repeats]),
        'date': np.concatenate([np.arange(size) for size in np.repeat(repeats, measures.size)]),
        'value': np.round(np.random.default_rng(111).random(measures.size*repeats.sum()), 2),
    }).lazy()

print(get_raw_data().collect())
# shape: (16, 4)
# ┌─────┬─────────┬──────┬───────┐
# │ id  ┆ measure ┆ date ┆ value │
# │ --- ┆ ---     ┆ ---  ┆ ---   │
# │ str ┆ str     ┆ i64  ┆ f64   │
# ╞═════╪═════════╪══════╪═══════╡
# │ A   ┆ height  ┆ 0    ┆ 0.15  │
# │ A   ┆ height  ┆ 1    ┆ 0.17  │
# │ A   ┆ height  ┆ 2    ┆ 0.51  │
# │ A   ┆ width   ┆ 0    ┆ 0.66  │
# │ A   ┆ width   ┆ 1    ┆ 0.77  │
# │ …   ┆ …       ┆ …    ┆ …     │
# │ B   ┆ width   ┆ 2    ┆ 0.72  │
# │ C   ┆ height  ┆ 0    ┆ 0.08  │
# │ C   ┆ height  ┆ 1    ┆ 0.42  │
# │ C   ┆ width   ┆ 0    ┆ 0.4   │
# │ C   ┆ width   ┆ 1    ┆ 0.94  │
# └─────┴─────────┴──────┴───────┘

def expr_add_categories() -> pl.Expr:
    """Generate a sample list of expressions to add some derived categories."""
    return [(pl.col('height')/pl.col('width')).alias('ratio')]

def expr_add_ids() -> pl.Expr:
    """Generate a sample list of expressions to add some derived ids."""
    return [
        (pl.col('A') / pl.col('B')).alias('AB'),
        (pl.col('A') / pl.col('C')).alias('AC')
    ]

def add_categories(df: pl.LazyFrame) -> pl.LazyFrame:
    """Add various derived categories to LazyFrame."""
    return (
        df
        .collect()  # pivot requires eager mode
        .pivot(index=['id', 'date'], columns='measure', values='value')
        .lazy()     # back to lazy mode
        .with_columns(expr_add_categories())
        .melt(id_vars=['id', 'date'], variable_name='measure')
        .drop_nulls()
        .select(['id', 'measure', 'date', 'value'])
        .sort(['id', 'measure', 'date'])
        .set_sorted(['id', 'measure', 'date'])
    )

def add_ids(df: pl.LazyFrame) -> pl.LazyFrame:
    """Add various derived IDs to LazyFrame."""
    return (
        df
        .collect()  # pivot requires eager mode
        .pivot(index=['measure', 'date'], columns='id', values='value')
        .lazy()     # back to lazy mode
        .with_columns(expr_add_ids())
        .melt(id_vars=['measure', 'date'], variable_name='id')
        .drop_nulls()
        .select(['id', 'measure', 'date', 'value'])
        .sort(['id', 'measure', 'date'])
        .set_sorted(['id', 'measure', 'date'])
    )

def get_modified_data() -> pl.LazyFrame:
    """Get raw data and add derived categories and names to LazyFrame."""
    return (
        get_raw_data()
        .pipe(add_categories)
        .pipe(add_ids)
    )

print(get_modified_data().collect())
# shape: (39, 4)
# ┌─────┬─────────┬──────┬──────────┐
# │ id  ┆ measure ┆ date ┆ value    │
# │ --- ┆ ---     ┆ ---  ┆ ---      │
# │ str ┆ str     ┆ i64  ┆ f64      │
# ╞═════╪═════════╪══════╪══════════╡
# │ A   ┆ height  ┆ 0    ┆ 0.15     │
# │ A   ┆ height  ┆ 1    ┆ 0.17     │
# │ A   ┆ height  ┆ 2    ┆ 0.51     │
# │ A   ┆ ratio   ┆ 0    ┆ 0.227273 │
# │ A   ┆ ratio   ┆ 1    ┆ 0.220779 │
# │ …   ┆ …       ┆ …    ┆ …        │
# │ C   ┆ height  ┆ 1    ┆ 0.42     │
# │ C   ┆ ratio   ┆ 0    ┆ 0.2      │
# │ C   ┆ ratio   ┆ 1    ┆ 0.446809 │
# │ C   ┆ width   ┆ 0    ┆ 0.4      │
# │ C   ┆ width   ┆ 1    ┆ 0.94     │
# └─────┴─────────┴──────┴──────────┘

# *************************************************************
# Python: 3.12.0
# Numpy: 1.26.4
# Polars: 0.20.31
# *************************************************************

Edit: Please assume the content of the DataFrame is dynamic. The unique elements of any of the index levels is unknown advance.

Edit: Here is an example for "lazypivot" as UDF. I am not sure how to apply an aggregation function without using a lambda function. Better proposals are welcome.

import warnings
from typing import Callable
import polars as pl

def lazypivot(df: pl.LazyFrame,
              index: str | list[str] | None,
              columns: str | list[str] | None,
              values: str | list[str] | None,
              column_values: str | list[str] | None = None,
              aggregate_function: Callable | None = None,
              maintain_order: bool = True,
              sort_columns: bool = True,
              ) -> pl.LazyFrame:
    """Pivot a LazyFrame with or without aggregation."""

    # Collect unique column values if not provided
    if column_values is None:
        warnings.warn(
            'No column_values provided. Switching between eager and lazy mode necessary to collect unique column values.',
            UserWarning
        )
        collected_df = df.collect()
        column_values = collected_df[columns].unique().sort() if sort_columns else collected_df[columns].unique()
        df = collected_df.lazy()

    # Define the aggregation function
    if aggregate_function is None:
        agg_expr = [pl.col(values).filter(pl.col(columns) == value).first().alias(value) for value in column_values]
    else:
        agg_expr = [aggregate_function(pl.col(values).filter(pl.col(columns) == value)).alias(value) for value in column_values]

    # Perform the pivot
    return df.group_by(index, maintain_order=maintain_order).agg(agg_expr)

df = pl.DataFrame(
    {
        "idx": ["A", "A", "A", "B", "B"],
        "cat": ["x", "y", "z", "x", "y"],
        "val": [1, 2, 3, 4, 5],
    }
)
print(df)

print('pivot with column_values:')
df_new = df.lazy().pipe(lazypivot, index="idx", columns="cat", values="val", column_values=['x', 'y', 'z']).collect()
print(df_new)

print('pivot without column_values:')
df_new = df.lazy().pipe(lazypivot, index="idx", columns="cat", values="val").collect()
print(df_new)

# shape: (5, 3)
# ┌─────┬─────┬─────┐
# │ idx ┆ cat ┆ val │
# │ --- ┆ --- ┆ --- │
# │ str ┆ str ┆ i64 │
# ╞═════╪═════╪═════╡
# │ A   ┆ x   ┆ 1   │
# │ A   ┆ y   ┆ 2   │
# │ A   ┆ z   ┆ 3   │
# │ B   ┆ x   ┆ 4   │
# │ B   ┆ y   ┆ 5   │
# └─────┴─────┴─────┘

# pivot with column_values:
# shape: (2, 4)
# ┌─────┬─────┬─────┬──────┐
# │ idx ┆ x   ┆ y   ┆ z    │
# │ --- ┆ --- ┆ --- ┆ ---  │
# │ str ┆ i64 ┆ i64 ┆ i64  │
# ╞═════╪═════╪═════╪══════╡
# │ A   ┆ 1   ┆ 2   ┆ 3    │
# │ B   ┆ 4   ┆ 5   ┆ null │
# └─────┴─────┴─────┴──────┘

# pivot without column_values:
# shape: (2, 4)
# ┌─────┬─────┬─────┬──────┐
# │ idx ┆ x   ┆ y   ┆ z    │
# │ --- ┆ --- ┆ --- ┆ ---  │
# │ str ┆ i64 ┆ i64 ┆ i64  │
# ╞═════╪═════╪═════╪══════╡
# │ A   ┆ 1   ┆ 2   ┆ 3    │
# │ B   ┆ 4   ┆ 5   ┆ null │
# └─────┴─────┴─────┴──────┘
# UserWarning: No column_values provided. Switching between eager and # lazy mode necessary to collect unique column values.

Solution

  • From the documentation of of pl.DataFrame.pivot:

    Note that pivot is only available in eager mode. If you know the unique column values in advance, you can use polars.LazyFrame.groupby() to get the same result as above in lazy mode: [...]

    In your concrete example, you'd need to know the unique values of measure (i.e. "height" and "width") and id (i.e. "A", "B", "C") in advance to to refactor add_categories and add_ids, respectively.

    The refactoring to functions that fully works in lazy mode would look as follows:

    def add_categories(df: pl.LazyFrame) -> pl.LazyFrame:
        """Add various derived categories to LazyFrame."""
        measurement_categories = ["height", "width"]
        return (
            df
            .group_by("id", "date", maintain_order=True)
            .agg(
                pl.col("value").filter(pl.col("measure") == cat).first().alias(cat)
                for cat in measurement_categories
            )
            .with_columns(
                expr_add_categories()
            )
            .melt(id_vars=['id', 'date'], variable_name='measure')
            .drop_nulls()
            .select(['id', 'measure', 'date', 'value'])
            .sort(['id', 'measure', 'date'])
            .set_sorted(['id', 'measure', 'date'])
        )
    
    def add_ids(df: pl.LazyFrame) -> pl.LazyFrame:
        """Add various derived IDs to LazyFrame."""
        ids = ["A", "B", "C"]
        return (
            df
            .group_by("measure", "date", maintain_order=True)
            .agg(
                pl.col("value").filter(pl.col("id") == id).first().alias(id)
                for id in ids
            )
            .with_columns(expr_add_ids())
            .melt(id_vars=['measure', 'date'], variable_name='id')
            .drop_nulls()
            .select(['id', 'measure', 'date', 'value'])
            .sort(['id', 'measure', 'date'])
            .set_sorted(['id', 'measure', 'date'])
        )