performance, for-loop, if-statement, vectorization, python-polars

How to replace a conditional for loop with a more efficient approach using Polars?


I have a dataset with multiple ids, values, and dates (large T, large N).

import random
import numpy as np
import polars as pl
from datetime import datetime, timedelta
     
n_rows = 1000  # You can adjust this as needed
start_date = datetime(2000, 1, 1)
end_date = datetime(2023, 12, 31)

dates = [
    start_date + timedelta(days=random.randint(0, (end_date - start_date).days))
    for _ in range(n_rows)
]

unique_ids = [random.randint(0, 100) for _ in range(n_rows)]
returns = [random.uniform(-0.01, 0.01) for _ in range(n_rows)]

data = pl.DataFrame({'date': dates, 'values': returns, 'ids': unique_ids})
data = data.with_columns(date=pl.col("date").dt.month_end())
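
For reference, a quick schema check (a sketch; the exact repr varies by Polars version, but date stays a datetime snapped to the month end):

print(data.schema)
# date: Datetime, values: Float64, ids: Int64 (approximate)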

Additionally, there are two other datasets: SB_dates, with random dates (in blocks), and newBL, a boolean matrix of the same shape as SB_dates (see the quick shape check after the two frames).

SB_dates = pl.DataFrame({
    'bd0': ['2009-03-31', '2010-05-31', '2011-03-31', '2001-12-31', '2000-06-30', '2015-03-31', '2013-01-31', '2018-01-31', '2020-08-31', '2021-04-30', '2015-12-31', '2002-05-31', '2016-12-31', '2021-09-30', '2001-02-28', '2022-02-28', '2015-04-30', '2016-11-30', '2021-05-31', '2021-10-31'],
    'bd1': ['2014-06-30', '2016-11-30', '2009-11-30', '2004-10-31', '2011-06-30', '2022-09-30', '2011-01-31', '2005-12-31', '2001-01-31', '2014-05-31', '2006-03-31', '2002-02-28', '2021-07-31', '2019-12-31', '2000-07-31', '2021-12-31', '2017-09-30', '2000-06-30', '2021-09-30', '2007-06-30'],
    'bd2': ['2018-12-31', '2015-08-31', '2016-07-31', '2003-05-31', '2017-03-31', '2009-11-30', '2017-04-30', '2005-04-30', '2008-06-30', '2018-10-31', '2018-04-30', '2013-02-28', '2013-07-31', '2020-02-29', '2015-05-31', '2002-04-30', '2020-02-29', '2011-04-30', '2004-07-31', '2021-10-31'],
    'bd3': ['2006-03-31', '2023-05-31', '2002-01-31', '2020-04-30', '2005-05-31', '2002-01-31', '2022-04-30', '2002-12-31', '2013-10-31', '2002-08-31', '2015-08-31', '2000-06-30', '2009-05-31', '2013-10-31', '2014-07-31', '2012-04-30', '2010-08-31', '2016-03-31', '2019-01-31', '2005-10-31'],
    'bd4': ['2006-01-31', '2010-01-31', '2001-03-31', '2011-04-30', '2021-01-31', '2017-04-30', '2023-01-31', '2010-09-30', '2017-04-30', '2015-03-31', '2023-05-31', '2006-12-31', '2004-06-30', '2005-05-31', '2007-06-30', '2004-04-30', '2003-10-31', '2016-08-31', '2003-12-31', '2000-03-31'],
    'bd5': ['2010-09-30', '2010-10-31', '2012-01-31', '2018-03-31', '2011-07-31', '2022-03-31', '2019-03-31', '2003-03-31', '2001-05-31', '2000-04-30', '2022-03-31', '2014-06-30', '2015-05-31', '2021-03-31', '2010-05-31', '2020-11-30', '2012-05-31', '2011-02-28', '2008-03-31', '2023-07-31']
}).with_columns(pl.all().str.to_date())

newBL = pl.DataFrame({
    'bn0': [True, False, False, False, True, True, True, True, True, False, True, True, True, False, True, False, False, False, False, False],
    'bn1': [True, True, False, False, False, False, True, True, False, False, True, True, False, False, False, True, True, True, True, True],
    'bn2': [True, False, False, False, False, False, True, False, True, True, True, False, False, False, True, True, True, False, True, True],
    'bn3': [True, False, True, True, True, True, False, False, True, True, False, True, True, True, False, False, True, True, True, False],
    'bn4': [True, False, True, False, False, True, False, False, True, False, False, False, True, True, True, False, False, False, False, False],
    'bn5': [True, False, True, False, False, True, True, True, True, False, True, True, True, True, True, False, True, True, True, True]
})
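
Since the boolean flags are consumed positionally, cell for cell, the two frames must have the same shape; a quick sanity check:

assert SB_dates.shape == newBL.shape  # both (20, 6)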

For each new block (where the newBL flag is True), I need to select n (here 5) random ids and calculate the average of their values on that date. If it's not a new block, I should retain the same random ids that were previously selected.

I have coded the function like this, using the data, SB_dates and newBL DataFrames defined above.

def get_mean_chrono_polars(data, SB_dates, newBL, n=5):
    n_rows, n_columns = SB_dates.shape
    df_sb = pl.DataFrame()

    for col in range(n_columns):
        date_column = pl.DataFrame(SB_dates[:, col])
        newBL_column = newBL[:, col]
        mean_values_col = []

        for i in range(n_rows):
            # all observations on this block date
            filter_ids = (
                data
                .select(pl.col("date", "values", "ids"))
                .filter(pl.col("date") == date_column[i, :])
            )
            if newBL_column[i]:
                # new block: draw a fresh set of n random ids
                # (assumes the first row of each column is a new block;
                # otherwise random_ids is referenced before assignment)
                random_ids = filter_ids.select(pl.col("ids").shuffle(seed=1)).limit(n)

            # otherwise reuse the most recently drawn random_ids
            selected_ids_df = (
                filter_ids
                .select(pl.col("date", "values", "ids"))
                .filter(pl.col("ids").is_in(random_ids["ids"]))
            )

            mean_values_col.append(selected_ids_df["values"].mean())

        df_sb = df_sb.hstack([pl.Series(str(col), mean_values_col)])
    return df_sb


r = get_mean_chrono_polars(data, SB_dates, newBL, n=5)
shape: (20, 6)
┌───────────┬───────────┬───────────┬───────────┬───────────┬───────────┐
│ 0         ┆ 1         ┆ 2         ┆ 3         ┆ 4         ┆ 5         │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│ f64       ┆ f64       ┆ f64       ┆ f64       ┆ f64       ┆ f64       │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 0.00242   ┆ 0.001441  ┆ 0.00196   ┆ 0.003028  ┆ -0.001259 ┆ 0.000704  │
│ null      ┆ 0.004338  ┆ null      ┆ null      ┆ null      ┆ null      │
│ null      ┆ null      ┆ null      ┆ -0.000406 ┆ 0.000412  ┆ -0.000115 │
│ null      ┆ null      ┆ null      ┆ -0.002489 ┆ null      ┆ null      │
│ -0.001194 ┆ null      ┆ null      ┆ -0.002647 ┆ null      ┆ null      │
│ …         ┆ …         ┆ …         ┆ …         ┆ …         ┆ …         │
│ null      ┆ -0.000378 ┆ -0.000107 ┆ null      ┆ null      ┆ null      │
│ null      ┆ -0.001728 ┆ -0.001958 ┆ 0.000582  ┆ null      ┆ -0.000583 │
│ null      ┆ -0.001194 ┆ null      ┆ 0.000622  ┆ null      ┆ -0.002279 │
│ null      ┆ 0.003631  ┆ 0.002787  ┆ -0.003758 ┆ null      ┆ -0.002329 │
│ null      ┆ 0.00516   ┆ 0.002487  ┆ null      ┆ null      ┆ -0.002492 │
└───────────┴───────────┴───────────┴───────────┴───────────┴───────────┘

This works, but the problem is that I have two nested for loops, with a condition in the inner one that depends on another DataFrame.

How can I perform this in a more efficient manner?


Solution

  • The two nested loops can be replaced with a single expression pipeline: unpivot SB_dates and newBL into long format (one row per block cell), left-join data on the date, sample n ids per group only where the new-block flag is set, and forward-fill the resulting means. That gets you the mean values in a one-column DataFrame, which can optionally be reshaped back into the original SB_dates shape if desired:

    def get_mean_chrono_polars(data, SB_dates, newBL, n=5):
        # long format: one row per (row index, block column) cell,
        # with the matching new-block flag attached
        static_data = (
            SB_dates.with_row_index()
            .unpivot(index='index', variable_name='col', value_name='date')
            .with_columns(newBL.unpivot().select(bl='value'))
        )

        return (
            # attach all observations for each block date
            static_data.join(data, on=pl.col('date').dt.date(), how='left')
            .group_by('index', 'col', maintain_order=True)
            # sample n ids only where the new-block flag is set; null otherwise
            .agg(pl.when(pl.col('bl')).then(pl.col('ids', 'values').sample(n)))
            # spread the n sampled values into n columns
            .select(
                pl.col('values').list.to_struct(
                    n_field_strategy='max_width',
                    fields=[f'val{j}' for j in range(n)],
                    upper_bound=n,
                ),
            )
            .unnest('values')
            .select(
                # row-wise mean of the n sampled values; the forward fill
                # carries the last block's mean into non-new-block rows
                mean_val=(pl.sum_horizontal(pl.all()) / n)
                .fill_null(strategy='forward')
            # steps from this point are optional
                .reshape(SB_dates.shape)
                .arr.to_struct()
            )
            .unnest('mean_val')
        )
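
  • Called the same way as before, this returns a frame with the same shape as SB_dates. Note that sample(n) is unseeded here, so the means differ between runs; Expr.sample accepts a seed if reproducibility is needed (a sketch, swapping only that one call):

    r = get_mean_chrono_polars(data, SB_dates, newBL, n=5)
    print(r.shape)  # (20, 6), one mean per block cell

    # inside the agg, for reproducible draws:
    # .agg(pl.when(pl.col('bl')).then(pl.col('ids', 'values').sample(n, seed=1)))

  • The list.to_struct / unnest pair is what spreads the n sampled values per group into n columns so that sum_horizontal can average them row-wise; a tiny standalone demo on hypothetical toy data:

    demo = pl.DataFrame({'values': [[0.1, 0.2, 0.3], None]})
    print(
        demo.select(
            pl.col('values').list.to_struct(
                n_field_strategy='max_width',
                fields=[f'val{j}' for j in range(3)],
            )
        ).unnest('values')
    )
    # shape: (2, 3) with columns val0..val2; the null list becomes a row of nulls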