I am working on a bootstrapping project and need to sample M=N-1 observations with replacement where N is the number of unique observations in a specific group (defined by group_id). I need to figure out how to do this in polars. Any solutions?
Here's an example showing what I would like to accomplish:
# Have: one row per observation. N is the number of observations in the
# row's group and M = N - 1 is how many rows should be drawn from that group.
water_data = {
    "group_id": [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5],
    "obs_id_within_group": [1, 2, 3, 4, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 1, 2, 3],
    "N": [4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3],
    "M": [3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 2, 2, 2],
    "water_gallons": [
        12, 23, 21, 11,
        10, 10, 10,
        23, 24, 25,
        27, 30, 17, 12,
        11, 14, 20,
    ],
    "water_source": [
        "lake", "lake", "pond", "river",
        "lake", "glacier", "glacier",
        "lake", "pond", "river",
        "lake", "lake", "pond", "river",
        "river", "lake", "glacier",
    ],
    "water_acidity": [3, 4, 5, 1, 2, 4, 3, 2, 3, 3, 4, 6, 7, 8, 8, 3, 1],
}
# Build the input frame from the column dict and display it.
df = pl.DataFrame(data=water_data)
print(df)
# Want to randomly sample with replacement to something like this: M rows per
# group, where every sampled row is an intact copy of one original observation
# (the same observation may appear more than once).
sampled_water_data = {
    "group_id": [1, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5],
    "obs_id_within_group": [1, 2, 2, 3, 3, 3, 2, 4, 1, 1, 2, 1],
    "N": [4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3],
    "M": [3, 3, 3, 2, 2, 2, 2, 3, 3, 3, 2, 2],
    "water_gallons": [12, 23, 23, 10, 10, 25, 24, 12, 27, 27, 14, 11],
    "water_source": [
        "lake", "lake", "lake",
        "glacier", "glacier",
        "river", "pond",
        "river", "lake", "lake",
        "lake", "river",
    ],
    # Group 5 / obs 2 has acidity 3 in the input data, so the sampled copy of
    # that row must carry 3 as well.
    "water_acidity": [3, 4, 4, 3, 3, 3, 3, 8, 4, 4, 3, 8],
}
# Build the desired-result frame from the column dict and display it.
df_sampled = pl.DataFrame(data=sampled_water_data)
print(df_sampled)
Not sure how to sample a specific number from each group.
Group by `group_id`, sample `pl.len() - 1` rows with replacement within each group, and finally explode the resulting list column(s) — everything except `group_id` — back into rows:
# Sample whole rows, not individual columns. `pl.all().sample(...)` draws a
# separate random sample for every column, so the values that end up side by
# side in a row can come from different observations. The printed output shown
# after this snippet came from that per-column form and shows the problem,
# e.g. group 3 / obs 2 paired with water_gallons 25, which belongs to obs 3.
# Packing each observation into a struct first makes `sample` pick entire rows.
df2 = (
    df.group_by("group_id")
    .agg(
        # One struct per observation, holding every non-key column.
        pl.struct(pl.all().exclude("group_id"))
        # M = N - 1 draws with replacement from the current group.
        .sample(pl.len() - 1, with_replacement=True)
        .alias("row")
    )
    # List of sampled structs -> one sampled observation per output row.
    .explode("row")
    # Struct fields -> ordinary columns again.
    .unnest("row")
)
print(df2)
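Output from one run of `pl.all().sample(...)` (sampling is random, so your rows will differ). Note that sampling `pl.all()` draws each column independently, so the values within a row are not guaranteed to come from the same original observation — e.g. the first row pairs obs_id_within_group 2 of group 3 with water_gallons 25, which belongs to obs 3: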
shape: (12, 7)
┌──────────┬─────────────────────┬─────┬─────┬───────────────┬──────────────┬───────────────┐
│ group_id ┆ obs_id_within_group ┆ N ┆ M ┆ water_gallons ┆ water_source ┆ water_acidity │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ str ┆ i64 │
╞══════════╪═════════════════════╪═════╪═════╪═══════════════╪══════════════╪═══════════════╡
│ 3 ┆ 2 ┆ 3 ┆ 2 ┆ 25 ┆ river ┆ 2 │
│ 3 ┆ 3 ┆ 3 ┆ 2 ┆ 25 ┆ lake ┆ 3 │
│ 5 ┆ 3 ┆ 3 ┆ 2 ┆ 14 ┆ river ┆ 3 │
│ 5 ┆ 3 ┆ 3 ┆ 2 ┆ 14 ┆ lake ┆ 8 │
│ 1 ┆ 2 ┆ 4 ┆ 3 ┆ 21 ┆ lake ┆ 3 │
│ 1 ┆ 2 ┆ 4 ┆ 3 ┆ 12 ┆ river ┆ 4 │
│ 1 ┆ 1 ┆ 4 ┆ 3 ┆ 12 ┆ lake ┆ 3 │
│ 4 ┆ 2 ┆ 4 ┆ 3 ┆ 17 ┆ pond ┆ 6 │
│ 4 ┆ 3 ┆ 4 ┆ 3 ┆ 30 ┆ river ┆ 7 │
│ 4 ┆ 1 ┆ 4 ┆ 3 ┆ 12 ┆ lake ┆ 8 │
│ 2 ┆ 2 ┆ 3 ┆ 2 ┆ 10 ┆ glacier ┆ 2 │
│ 2 ┆ 2 ┆ 3 ┆ 2 ┆ 10 ┆ lake ┆ 3 │
└──────────┴─────────────────────┴─────┴─────┴───────────────┴──────────────┴───────────────┘