I have a dataset with multiple ids, values, and dates (large T, large N).
import random
import numpy as np
import polars as pl
from datetime import datetime, timedelta
n_rows = 1000 # You can adjust this as needed
start_date = datetime(2000, 1, 1)
end_date = datetime(2023, 12, 31)
dates = [
start_date + timedelta(days=random.randint(0, (end_date - start_date).days))
for _ in range(n_rows)
]
unique_ids = [random.randint(0, 100) for _ in range(n_rows)]
returns = [random.uniform(-0.01, 0.01) for _ in range(n_rows)]
data = pl.DataFrame({'date': dates, 'values': returns, 'ids': unique_ids})
data = data.with_columns(date=pl.col("date").dt.month_end())
Additionally, there are two other datasets: SB_dates, with random dates (in blocks), and newBL, a boolean matrix of the same shape as SB_dates.
SB_dates = pl.DataFrame({
'bd0': ['2009-03-31', '2010-05-31', '2011-03-31', '2001-12-31', '2000-06-30', '2015-03-31', '2013-01-31', '2018-01-31', '2020-08-31', '2021-04-30', '2015-12-31', '2002-05-31', '2016-12-31', '2021-09-30', '2001-02-28', '2022-02-28', '2015-04-30', '2016-11-30', '2021-05-31', '2021-10-31'],
'bd1': ['2014-06-30', '2016-11-30', '2009-11-30', '2004-10-31', '2011-06-30', '2022-09-30', '2011-01-31', '2005-12-31', '2001-01-31', '2014-05-31', '2006-03-31', '2002-02-28', '2021-07-31', '2019-12-31', '2000-07-31', '2021-12-31', '2017-09-30', '2000-06-30', '2021-09-30', '2007-06-30'],
'bd2': ['2018-12-31', '2015-08-31', '2016-07-31', '2003-05-31', '2017-03-31', '2009-11-30', '2017-04-30', '2005-04-30', '2008-06-30', '2018-10-31', '2018-04-30', '2013-02-28', '2013-07-31', '2020-02-29', '2015-05-31', '2002-04-30', '2020-02-29', '2011-04-30', '2004-07-31', '2021-10-31'],
'bd3': ['2006-03-31', '2023-05-31', '2002-01-31', '2020-04-30', '2005-05-31', '2002-01-31', '2022-04-30', '2002-12-31', '2013-10-31', '2002-08-31', '2015-08-31', '2000-06-30', '2009-05-31', '2013-10-31', '2014-07-31', '2012-04-30', '2010-08-31', '2016-03-31', '2019-01-31', '2005-10-31'],
'bd4': ['2006-01-31', '2010-01-31', '2001-03-31', '2011-04-30', '2021-01-31', '2017-04-30', '2023-01-31', '2010-09-30', '2017-04-30', '2015-03-31', '2023-05-31', '2006-12-31', '2004-06-30', '2005-05-31', '2007-06-30', '2004-04-30', '2003-10-31', '2016-08-31', '2003-12-31', '2000-03-31'],
'bd5': ['2010-09-30', '2010-10-31', '2012-01-31', '2018-03-31', '2011-07-31', '2022-03-31', '2019-03-31', '2003-03-31', '2001-05-31', '2000-04-30', '2022-03-31', '2014-06-30', '2015-05-31', '2021-03-31', '2010-05-31', '2020-11-30', '2012-05-31', '2011-02-28', '2008-03-31', '2023-07-31']
}).with_columns(pl.all().str.to_date())
newBL = pl.DataFrame({
'bn0': [True, False, False, False, True, True, True, True, True, False, True, True, True, False, True, False, False, False, False, False],
'bn1': [True, True, False, False, False, False, True, True, False, False, True, True, False, False, False, True, True, True, True, True],
'bn2': [True, False, False, False, False, False, True, False, True, True, True, False, False, False, True, True, True, False, True, True],
'bn3': [True, False, True, True, True, True, False, False, True, True, False, True, True, True, False, False, True, True, True, False],
'bn4': [True, False, True, False, False, True, False, False, True, False, False, False, True, True, True, False, False, False, False, False],
'bn5': [True, False, True, False, False, True, True, True, True, False, True, True, True, True, True, False, True, True, True, True]
})
For each new block, I need to select n = 5 random ids and calculate the average of their values on that date. If it's not a new block, I should retain the same random ids that were previously selected.
I have coded the function like this, using the data, SB_dates, and newBL DataFrames provided.
def get_mean_chrono_polars(data, SB_dates, newBL, n=5):
    n_rows, n_columns = SB_dates.shape
    df_sb = pl.DataFrame()
    for col in range(n_columns):
        date_column = pl.DataFrame(SB_dates[:, col])
        newBL_column = newBL[:, col]
        mean_values_col = []
        for i in range(n_rows):
            # All rows of data falling on this block date
            filter_ids = (
                data
                .select(pl.col("date", "values", "ids"))
                .filter(pl.col("date") == date_column[i, :])
            )
            if newBL_column[i]:
                # New block: draw a fresh set of n random ids
                random_ids = filter_ids.select(pl.col("ids").shuffle(seed=1)).limit(n)
            # Otherwise reuse random_ids from the previous new block
            selected_ids_df = (
                filter_ids
                .select(pl.col("date", "values", "ids"))
                .filter(pl.col("ids").is_in(random_ids["ids"]))
            )
            mean_values = selected_ids_df["values"].mean()
            mean_values_col.append(mean_values)
        mean_values_col = pl.Series(str(col), mean_values_col)
        df_sb = df_sb.hstack([mean_values_col])
    return df_sb
r = get_mean_chrono_polars(data, SB_dates, newBL, n=5)
shape: (20, 6)
┌───────────┬───────────┬───────────┬───────────┬───────────┬───────────┐
│ 0 ┆ 1 ┆ 2 ┆ 3 ┆ 4 ┆ 5 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 0.00242 ┆ 0.001441 ┆ 0.00196 ┆ 0.003028 ┆ -0.001259 ┆ 0.000704 │
│ null ┆ 0.004338 ┆ null ┆ null ┆ null ┆ null │
│ null ┆ null ┆ null ┆ -0.000406 ┆ 0.000412 ┆ -0.000115 │
│ null ┆ null ┆ null ┆ -0.002489 ┆ null ┆ null │
│ -0.001194 ┆ null ┆ null ┆ -0.002647 ┆ null ┆ null │
│ … ┆ … ┆ … ┆ … ┆ … ┆ … │
│ null ┆ -0.000378 ┆ -0.000107 ┆ null ┆ null ┆ null │
│ null ┆ -0.001728 ┆ -0.001958 ┆ 0.000582 ┆ null ┆ -0.000583 │
│ null ┆ -0.001194 ┆ null ┆ 0.000622 ┆ null ┆ -0.002279 │
│ null ┆ 0.003631 ┆ 0.002787 ┆ -0.003758 ┆ null ┆ -0.002329 │
│ null ┆ 0.00516 ┆ 0.002487 ┆ null ┆ null ┆ -0.002492 │
└───────────┴───────────┴───────────┴───────────┴───────────┴───────────┘
This works, but the problem is that I have two nested for-loops, with a condition in the inner one that depends on another DataFrame (newBL).
How can I do this more efficiently?
Unpivoting the two static dataframes, SB_dates and newBL, gives you all the data in column1, column2, ... order, which is exactly what is needed here to eliminate the first loop. (Even if it didn't, a sort could have been used to get it into that order.) In this reshaped form:

- Each element is uniquely identified by index and the variable column (renamed to col).
- A left join with data replaces the whole filter_ids = ... step.
- group_by then sets up the "choose n random ids on that date" step. With maintain_order=True, the original order from the left join is preserved, so each element of the final DataFrame correctly corresponds to each element of SB_dates and newBL.
- pl.when can be used inside the aggregation to null out any unneeded calculations.
- Only values is selected after the aggregation; the ids were only needed to pair each sampled id with its corresponding value. From here on, everything but the values is discarded.
- Converting the values list with list.to_struct and then unnest avoids lists of nulls / structs of nulls: you just get one null per new column whenever newBL is False.
- fill_null with strategy='forward' is applied at this point, carrying the last new-block mean forward.
- That gives you the mean_vals in a one-column dataframe, which can optionally be reshaped back into the original SB_dates shape if desired.
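For reference, the reshaped static_data can be inspected on its own. This is just the first step of the function below, repeated in isolation (a sketch, assuming the SB_dates and newBL DataFrames defined in the question):

static_data = (
    SB_dates.with_row_index()
    .unpivot(index='index', variable_name='col', value_name='date')
    .with_columns(newBL.unpivot().select(bl='value'))
)
print(static_data.shape)    # (120, 4): 20 rows stacked for each of the 6 block columns, in bd0, bd1, ... order
print(static_data.columns)  # ['index', 'col', 'date', 'bl']

Putting it all together: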
def get_mean_chrono_polars(data, SB_dates, newBL, n=5):
static_data = (
SB_dates.with_row_index()
.unpivot(index='index', variable_name='col', value_name='date')
.with_columns(newBL.unpivot().select(bl='value'))
)
return (
static_data.join(data, on=pl.col('date').dt.date(), how='left')
.group_by('index', 'col', maintain_order=True)
.agg(pl.when(pl.col('bl')).then(pl.col('ids', 'values').sample(n)))
.select(
pl.col('values').list.to_struct(
n_field_strategy='max_width',
fields=[f'val{j}' for j in range(n)],
upper_bound=n,
),
)
.unnest('values')
.select(
mean_val=(pl.sum_horizontal(pl.all()) / n)
.fill_null(strategy='forward')
# steps from this point are optional
.reshape(SB_dates.shape)
.arr.to_struct()
)
.unnest('mean_val')
)
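Example call, using the DataFrames defined above. A sketch: pl.Expr.sample draws without replacement, so this assumes each sampled block date has at least n matching rows in data, and because the draw is random the numbers will not match the seeded loop version exactly.

r_vec = get_mean_chrono_polars(data, SB_dates, newBL, n=5)
print(r_vec.shape)  # (20, 6), the shape of SB_dates, when the optional reshape step is kept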