pythondataframejoinpython-polars

How to perform split/merge/unpivot with Python and polars?


I have a data transformation problem where the original data consists of "blocks" of three rows of data, where the first row denotes a 'parent' and the two others are related children. A minimum working example looks like this:

import polars as pl
df_original = pl.DataFrame(
    {
        'Order ID': ['A', 'foo', 'bar'],
        'Parent Order ID': [None, 'A', 'A'],
        'Direction': ["Buy", "Buy", "Sell"],
        'Price': [1.21003, None, 1.21003],
        'Some Value': [4, 4, 4],
        'Name Provider 1': ['P8', 'P8', 'P8'],
        'Quote Provider 1': [None, 1.1, 1.3],
        'Name Provider 2': ['P2', 'P2', 'P2'],
        'Quote Provider 2': [None, 1.15, 1.25],
        'Name Provider 3': ['P1', 'P1', 'P1'],
        'Quote Provider 3': [None, 1.0, 1.4],
        'Name Provider 4': ['P5', 'P5', 'P5'],
        'Quote Provider 4': [None, 1.0, 1.4]
    }
)

In reality, there are up to 15 Providers (so up to 30 columns), but they are not necessary for the example.

We would like to transform this into a format where each row represents both the Buy and Sell quote of a single provider for that parent. The desired result is as follows:

df_desired = pl.DataFrame(
    {
        'Order ID': ['A', 'A', 'A', 'A'],
        'Parent Direction': ['Buy', 'Buy', 'Buy', 'Buy'],
        'Price': [1.21003, 1.21003, 1.21003, 1.21003],
        'Some Value': [4, 4, 4, 4],
        'Name Provider': ['P8', 'P2', 'P1', 'P5'],
        'Quote Buy': [1.1, 1.15, 1.0, 1.0],
        'Quote Sell': [1.3, 1.25, 1.4, 1.4],
    }
)
df_desired

However, I'm having a hard time doing this in polars.

My first approach was splitting the data into parents and children, then joining them together on the respective ids:

df_parents = (
    df_original
    .filter(pl.col("Parent Order ID").is_null())
    .drop(columns=['Parent Order ID'])
)
df_ch =  (
    df_original
    .filter(pl.col("Parent Order ID").is_not_null())
    .drop(columns=['Price', 'Some Value'])
)

ch_buy = df_ch.filter(pl.col("Direction") == 'Buy').drop(columns=['Direction'])
ch_sell = df_ch.filter(pl.col("Direction") == 'Sell').drop(columns=['Direction'])

df_joined = (
    df_parents
    .join(ch_buy, left_on='Order ID', right_on='Parent Order ID', suffix="_Buy")
    .join(ch_sell, left_on='Order ID', right_on='Parent Order ID', suffix="_Sell")
    # The Name and Quote columns in the parent are all empty, so they can go, buy they had to be there for the suffix to work for the first join
    .drop(columns=[f'Name Provider {i}' for i in range(1, 5)])
    .drop(columns=[f'Quote Provider {i}' for i in range(1, 5)])
)

But this still leaves you with a mess where you somehow have to split this into four rows - not eight, as you could easily do with .unpivot(). Any tips on how to best approach this? Am I missing some obivous method here?

EDIT: Added a slightly larger example dataframe with two parent orders and their children (the real-world dataset has ~50k+ of those) :

df_original_two_orders = pl.DataFrame(
    {
        'Order ID': ['A', 'foo', 'bar', 'B', 'baz', 'rar'], # Two parent orders
        'Parent Order ID': [None, 'A', 'A', None, 'B', 'B'],
        'Direction': ["Buy", "Buy", "Sell", "Sell", "Sell", "Buy"], # Second parent has different direction
        'Price': [1.21003, None, 1.21003, 1.1384, None, 1.1384],
        'Some Value': [4, 4, 4, 42, 42, 42],
        'Name Provider 1': ['P8', 'P8', 'P8', 'P2', 'P2', 'P2'],
        'Quote Provider 1': [None, 1.1, 1.3, None, 1.10, 1.40], 
        # Above, 1.10 corresponds to Buy for order A for to Sell for order B - depends on Direction
        'Name Provider 2': ['P2', 'P2', 'P2', 'P1', 'P1', 'P1'],
        'Quote Provider 2': [None, 1.15, 1.25, None, 1.11, 1.39],
        'Name Provider 3': ['P1', 'P1', 'P1', 'P3', 'P3', 'P3'],
        'Quote Provider 3': [None, 1.0, 1.4, None, 1.05, 1.55],
        'Name Provider 4': ['P5', 'P5', 'P5', None, None, None],
        'Quote Provider 4': [None, 1.0, 1.4, None, None, None]
    }
)

I think this is slightly more representative of the real world in that it has multiple parent orders and not all provider columns are filled for all orders, while still keeping the annoying business logic far away.

The correct output for this example is the following:

df_desired_two_parents = pl.DataFrame(
    {
        'Order ID': ['A']*4 + ['B'] * 3,
        'Parent Direction': ['Buy']*4 + ['Sell'] * 3,
        'Price': [1.21003] * 4 + [1.1384] * 3,
        'Some Value': [4] * 4 + [42] * 3,
        'Name Provider': ['P8', 'P2', 'P1', 'P5', 'P2', 'P1', 'P3'],
        'Quote Buy': [1.1, 1.15, 1.0, 1.0, 1.40, 1.39, 1.55], # Note the last three values are the "second" values in the original column now because the parent order was 'Sell'
        'Quote Sell': [1.3, 1.25, 1.4, 1.4, 1.10, 1.11, 1.05],
    }
)

Solution

  • Here's how I've attempted it:

    fill the nulls in the Parent Order ID column and use that to .group_by()

    columns = ["Order ID", "Direction", "Price", "Some Value"]
    names   = pl.col("^Name .*$")   # All name columns
    quotes  = pl.col("^Quote .*$")  # All quote columns
    
    (
        df_original_two_orders
         .with_columns(pl.col("Parent Order ID").backward_fill())
         .group_by("Parent Order ID")
         .agg(
            pl.col(columns).first(),
            pl.concat_list(names.first()).alias("Name"),  # Put all names into single column:  ["Name1", "Name2", ...]
            pl.col("^Quote .*$").slice(1),                # Create list for each quote column (skip first row): [1.1, 1.3], [1.15, 1.25], ...
         )
         .with_columns(
            pl.concat_list(                               # Create list of Buy values
               pl.when(pl.col("Direction") == "Buy")
                 .then(quotes.list.first())
                 .otherwise(quotes.list.last())
                 .alias("Buy")
            ),
            pl.concat_list(                               # Create list of Sell values
               pl.when(pl.col("Direction") == "Sell")
                 .then(quotes.list.first())
                 .otherwise(quotes.list.last())
                 .alias("Sell")
            )
        )
        .select(columns + ["Name", "Buy", "Sell"])       # Remove Name/Quote [1234..] columns
        .explode("Name", "Buy", "Sell")                  # Turn into rows
    )
    
    shape: (8, 7)
    ┌──────────┬───────────┬─────────┬────────────┬──────┬──────┬──────┐
    │ Order ID ┆ Direction ┆ Price   ┆ Some Value ┆ Name ┆ Buy  ┆ Sell │
    │ ---      ┆ ---       ┆ ---     ┆ ---        ┆ ---  ┆ ---  ┆ ---  │
    │ str      ┆ str       ┆ f64     ┆ i64        ┆ str  ┆ f64  ┆ f64  │
    ╞══════════╪═══════════╪═════════╪════════════╪══════╪══════╪══════╡
    │ A        ┆ Buy       ┆ 1.21003 ┆ 4          ┆ P8   ┆ 1.1  ┆ 1.3  │
    │ A        ┆ Buy       ┆ 1.21003 ┆ 4          ┆ P2   ┆ 1.15 ┆ 1.25 │
    │ A        ┆ Buy       ┆ 1.21003 ┆ 4          ┆ P1   ┆ 1.0  ┆ 1.4  │
    │ A        ┆ Buy       ┆ 1.21003 ┆ 4          ┆ P5   ┆ 1.0  ┆ 1.4  │
    │ B        ┆ Sell      ┆ 1.1384  ┆ 42         ┆ P2   ┆ 1.4  ┆ 1.1  │
    │ B        ┆ Sell      ┆ 1.1384  ┆ 42         ┆ P1   ┆ 1.39 ┆ 1.11 │
    │ B        ┆ Sell      ┆ 1.1384  ┆ 42         ┆ P3   ┆ 1.55 ┆ 1.05 │
    │ B        ┆ Sell      ┆ 1.1384  ┆ 42         ┆ null ┆ null ┆ null │
    └──────────┴───────────┴─────────┴────────────┴──────┴──────┴──────┘
    

    Explanation

    Step 1 creates a list of names and puts each quote into a list:

    agg = (
        df_original_two_orders
         .with_columns(pl.col("Parent Order ID").backward_fill())
         .group_by("Parent Order ID")
         .agg(
            pl.col(columns).first(),
            pl.concat_list(names.first()).alias("Name"),  # Put all names into single column:  ["Name1", "Name2", ...]
            pl.col("^Quote .*$").slice(1),                # Create list for each quote column (skip first row): [1.1, 1.3], [1.15, 1.25], ...
         )
    )
    
    shape: (2, 10)
    ┌────────────────┬──────────┬───────────┬─────────┬───┬────────────────┬────────────────┬────────────────┬────────────────┐
    │ Parent Order   ┆ Order ID ┆ Direction ┆ Price   ┆ … ┆ Quote Provider ┆ Quote Provider ┆ Quote Provider ┆ Quote Provider │
    │ ID             ┆ ---      ┆ ---       ┆ ---     ┆   ┆ 1              ┆ 2              ┆ 3              ┆ 4              │
    │ ---            ┆ str      ┆ str       ┆ f64     ┆   ┆ ---            ┆ ---            ┆ ---            ┆ ---            │
    │ str            ┆          ┆           ┆         ┆   ┆ list[f64]      ┆ list[f64]      ┆ list[f64]      ┆ list[f64]      │
    ╞════════════════╪══════════╪═══════════╪═════════╪═══╪════════════════╪════════════════╪════════════════╪════════════════╡
    │ B              ┆ B        ┆ Sell      ┆ 1.1384  ┆ … ┆ [1.1, 1.4]     ┆ [1.11, 1.39]   ┆ [1.05, 1.55]   ┆ [null, null]   │
    │ A              ┆ A        ┆ Buy       ┆ 1.21003 ┆ … ┆ [1.1, 1.3]     ┆ [1.15, 1.25]   ┆ [1.0, 1.4]     ┆ [1.0, 1.4]     │
    └────────────────┴──────────┴───────────┴─────────┴───┴────────────────┴────────────────┴────────────────┴────────────────┘
    

    Step 2 creates separate Buy/Sell lists from the Quote columns.

    We can use pl.when().then().otherwise() to test if we should take the first/last value in each Quote list depending if the Direction is Buy/Sell.

    (
        agg
         .with_columns(
            pl.concat_list(                               # Create list of Buy values
               pl.when(pl.col("Direction") == "Buy")
                 .then(quotes.list.first())
                 .otherwise(quotes.list.last())
                 .alias("Buy")
            ),
            pl.concat_list(                               # Create list of Sell values
               pl.when(pl.col("Direction") == "Sell")
                 .then(quotes.list.first())
                 .otherwise(quotes.list.last())
                 .alias("Sell")
            )
         )
         .select(columns + ["Name", "Buy", "Sell"])
     )
    
    shape: (2, 7)
    ┌──────────┬───────────┬─────────┬────────────┬──────────────────────┬─────────────────────┬─────────────────────┐
    │ Order ID ┆ Direction ┆ Price   ┆ Some Value ┆ Name                 ┆ Buy                 ┆ Sell                │
    │ ---      ┆ ---       ┆ ---     ┆ ---        ┆ ---                  ┆ ---                 ┆ ---                 │
    │ str      ┆ str       ┆ f64     ┆ i64        ┆ list[str]            ┆ list[f64]           ┆ list[f64]           │
    ╞══════════╪═══════════╪═════════╪════════════╪══════════════════════╪═════════════════════╪═════════════════════╡
    │ B        ┆ Sell      ┆ 1.1384  ┆ 42         ┆ ["P2", "P1", … null] ┆ [1.4, 1.39, … null] ┆ [1.1, 1.11, … null] │
    │ A        ┆ Buy       ┆ 1.21003 ┆ 4          ┆ ["P8", "P2", … "P5"] ┆ [1.1, 1.15, … 1.0]  ┆ [1.3, 1.25, … 1.4]  │
    └──────────┴───────────┴─────────┴────────────┴──────────────────────┴─────────────────────┴─────────────────────┘
    

    Finally we .explode() to turn the lists into rows.

    You can add a .drop_nulls() afterwards to remove the null rows if desired.