I have a big dataset and I need to do multiple successive joins that are slow. I figured an alternative was to pivot the whole dataframe I was merging successively, join once, and then pull out the data I want with a mix of pl.coalesce and pl.when (nested polars.col() expressions).
For my use case this is faster than the successive joins as the baskets grow bigger, but I was wondering whether there is a better (faster, more memory-efficient) way to achieve this.
In real life I may perform the following operations multiple times, on more features, more symbols, and bigger baskets.
import polars as pl
from polars import col
from vega_datasets import data
df = pl.from_pandas(data.stocks())
list_symbols = df.select(col('symbol').unique()).to_series().to_list()
"""
[
"IBM",
"MSFT",
"AAPL",
"AMZN",
"GOOG"
]
"""
basket = pl.DataFrame(
    {
        "MSFT": ["AMZN", "GOOG"],
        "AMZN": ["MSFT", "GOOG"],
        "GOOG": ["AAPL", "IBM"],
        "IBM": ["AMZN", "AAPL"],
        "AAPL": ["AMZN", "IBM"],
    }
).transpose(
    include_header=True,
    header_name="symbol",
    column_names=["symbol_1", "symbol_2"],
)
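For reference, the transpose turns the column-per-symbol mapping into one row per symbol; the resulting basket frame looks roughly like this:
"""
shape: (5, 3)
symbol | symbol_1 | symbol_2
-------|----------|---------
MSFT   | AMZN     | GOOG
AMZN   | MSFT     | GOOG
GOOG   | AAPL     | IBM
IBM    | AMZN     | AAPL
AAPL   | AMZN     | IBM
"""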
(
    df
    .join(basket, on='symbol', how='left')
    # In real life I use an iterative function to generate these successive
    # joins when needed; assume more than 2 joins.
    .join(
        df.select('date', 'symbol', col('price').name.suffix('_1')),
        left_on=['date', 'symbol_1'],
        right_on=['date', 'symbol'],
        how='left',
    )
    .join(
        df.select('date', 'symbol', col('price').name.suffix('_2')),
        left_on=['date', 'symbol_2'],
        right_on=['date', 'symbol'],
        how='left',
    )
)
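For completeness, the iterative function mentioned in the comment above could look roughly like this (a minimal sketch; join_prices and the basket-size parameter n are illustrative names, not my actual code):
def join_prices(joined: pl.DataFrame, prices: pl.DataFrame, n: int) -> pl.DataFrame:
    # Successively join one price column per basket member.
    for i in range(1, n + 1):
        joined = joined.join(
            prices.select('date', 'symbol', col('price').name.suffix(f'_{i}')),
            left_on=['date', f'symbol_{i}'],
            right_on=['date', 'symbol'],
            how='left',
        )
    return joined

# Equivalent to the two explicit joins above:
# join_prices(df.join(basket, on='symbol', how='left'), df, n=2)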
df_pivot = (
    # I normally use the lazyframe pivot implementation to work with lazyframes
    df.pivot(index='date', on='symbol', values='price')
)
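As an aside, LazyFrame has no native pivot, so one possible lazy equivalent when the pivoted values are known up front (as in list_symbols) is a filtered aggregation per symbol; a minimal sketch under that assumption:
lf_pivot = (
    df.lazy()
    .group_by('date')
    .agg(
        # One column per known symbol, filled from the matching rows.
        pl.col('price').filter(pl.col('symbol') == s).first().alias(s)
        for s in list_symbols
    )
)
# lf_pivot.collect() matches df_pivot up to row order.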
(
    df.join(basket, on="symbol", how="left")
    # alternative to the successive joins: join the wide frame once on date
    .join(
        df_pivot.select('date', pl.exclude('date').name.suffix('_price_to_drop')),
        on="date",
        how="left",
    )
    .with_columns(
        *[
            pl.coalesce(
                pl.when(col(f'symbol_{i}') == symbol)
                .then(col(f'{symbol}_price_to_drop'))
                for symbol in list_symbols
            ).alias(f'price_{i}')
            for i in [1, 2]
        ]
    )
    .select(pl.exclude("^.*to_drop$"))
)
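The coalesce trick works because a when/then chain without an otherwise evaluates to null on non-matching rows, so pl.coalesce simply keeps the single matching column per row. A tiny self-contained illustration:
demo = pl.DataFrame({'k': ['a', 'b'], 'x': [1, 2], 'y': [10, 20]})
demo.with_columns(
    pl.coalesce(
        pl.when(col('k') == 'a').then(col('x')),  # null where k != 'a'
        pl.when(col('k') == 'b').then(col('y')),  # null where k != 'b'
    ).alias('picked')
)
# picked: [1, 20]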
Note that I am usually working with lazyframes; this is not the case in the example above.
The best solution to avoid multiple join operations seems to be:
(
    df.join(basket, on="symbol", how="left")
    .with_columns(
        pl.concat_list(['symbol_1', 'symbol_2']).alias('symbol_right')
    )
    .explode('symbol_right')
    .join(
        df.select('date', 'symbol', col('price')),
        left_on=['date', 'symbol_right'],
        right_on=['date', 'symbol'],
        how='left',
    )
    .group_by(['symbol', 'date'])
    .agg(
        pl.exclude('symbol_right', 'price_right').first(),
        col('price_right').alias('price_list_to_drop'),
    )
    .with_columns(
        *[
            col("price_list_to_drop")
            .list.get(i)
            .alias(f"price_{i+1}")
            for i in [0, 1]
        ]
    )
    .select(pl.exclude("^.*to_drop$"))
)
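Since I usually work with lazyframes and bigger baskets, the same pipeline generalizes and runs lazily unchanged; a minimal sketch, where the basket size n is a stand-in parameter:
n = 2  # basket size in this example; generalizes to wider baskets
symbol_cols = [f'symbol_{i}' for i in range(1, n + 1)]
result = (
    df.lazy()
    .join(basket.lazy(), on='symbol', how='left')
    .with_columns(pl.concat_list(symbol_cols).alias('symbol_right'))
    .explode('symbol_right')
    .join(
        df.lazy().select('date', 'symbol', 'price'),
        left_on=['date', 'symbol_right'],
        right_on=['date', 'symbol'],
        how='left',
    )
    .group_by(['symbol', 'date'])
    .agg(
        pl.exclude('symbol_right', 'price_right').first(),
        col('price_right').alias('price_list_to_drop'),
    )
    .with_columns(
        col('price_list_to_drop').list.get(i).alias(f'price_{i + 1}')
        for i in range(n)
    )
    .select(pl.exclude('^.*to_drop$'))
    .collect()
)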