Below I demonstrate a workflow for fitting the same model to many datasets in R by nesting datasets by test_id
, and then fitting the same model to each dataset, and extracting a statistic from each model.
My goal is to create the equivalent workflow in Python, using polars, but I will use pandas if necessary.
library(tidyverse)
SIMS <- 3
TRIALS <- 1e3
PROB_A <- .65
PROB_B <- .67
df <- bind_rows(
tibble(
recipe = "A",
trials = TRIALS,
events = rbinom(n=SIMS, size=trials, prob=PROB_A),
rate = events/trials) |>
mutate(test_id = 1:n()),
tibble(
recipe = "B",
trials = TRIALS,
events = rbinom(n=SIMS, size=trials, prob=PROB_B),
rate = events/trials) |>
mutate(test_id = 1:n())
)
df
df_nest <- df |>
group_by(test_id) |>
nest()
df_nest
Define two functions to map over my nested data:
glm_foo <- function(.data){
glm(formula = rate ~ recipe,
data = .data,
weights = trials,
family = binomial)
}
glm_foo(df_nest$data[[1]])
fit_and_extract <- function(.data){
m <- glm(formula = rate ~ recipe,
data = .data,
weights = trials,
family = binomial)
m$coefficients['recipeB']
}
fit_and_extract(df_nest$data[[1]])
df_nest |>
mutate(
model = map(.x = data, .f = glm_foo),
trt_b = map_dbl(.x = data, .f = fit_and_extract)
)
test_id data model trt_b
<int> <list> <list> <dbl>
1 <tibble> <S3: glm> 0.05606076
2 <tibble> <S3: glm> 0.11029236
3 <tibble> <S3: glm> 0.01304480
#Python Section
I can create the same nested data structure in polars, but I am unsure of how to fit the model to each nested dataset within the list column called data
.
import polars as pl
from polars import col
import numpy as np
SIMS = 3
TRIALS = int(1e3)
PROB_A = .65
PROB_B = .67
df_a = pl.DataFrame({
'recipe': "A",
'trials': TRIALS,
'events': np.random.binomial(n=TRIALS, p=PROB_A, size=SIMS),
'test_id': np.arange(SIMS)
})
df_b = pl.DataFrame({
'recipe': "B",
'trials': TRIALS,
'events': np.random.binomial(n=TRIALS, p=PROB_B, size=SIMS),
'test_id': np.arange(SIMS)
})
df = (pl.concat([df_a, df_b], rechunk=True)
.with_columns(
fails = col('trials') - col('events')
))
df
df_agg = df.group_by('test_id').agg(data = pl.struct('events', 'fails', 'recipe'))
df_agg.sort('test_id')
At this point my mental model of pandas starts to crumble. There are so many mapping options and I'm not really sure how to trouble shoot at this stage.
df_agg.with_columns(
(
pl.struct(["data"]).map_batches(
lambda x: smf.glm('events + fails ~ recipe', family=sm.families.Binomial(), data=x.struct.field('data').to_pandas()).fit()
)
).alias("model")
)
ComputeError: PatsyError: Error evaluating factor: TypeError: cannot use __getitem__
on Series of dtype List(Struct({'events': Int64, 'fails': Int64, 'recipe': String})) with argument 'recipe' of type 'str'
events + fails ~ recipe
smf.glm()
appears to want a Pandas DataFrame and the given formula can refer to column names.
data = pd.DataFrame(dict(events=[630], fails=[370], recipe=["A"]))
(smf.glm("events + fails ~ recipe", family=sm.families.Binomial(), data=data)
.fit().pvalues)
# Intercept 4.448408e-16
# dtype: float64
When you pass a struct through a UDF, you get a Series:
df.head(1).with_columns(pl.struct("events", "fails", "recipe").map_batches(print))
shape: (1,)
Series: "events" [struct[3]]
[
{630,370,"A"}
]
Series.struct.unnest()
will give you a DataFrame (which you can then call .to_pandas()
on)
shape: (1, 3)
┌────────┬───────┬────────┐
│ events ┆ fails ┆ recipe │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞════════╪═══════╪════════╡
│ 630 ┆ 370 ┆ A │
└────────┴───────┴────────┘
I don't know R but it seems the difference in this case is that with Polars, creating a List would be the final step in the order of operations.
Once you have a List, it's not as easy to execute such custom functionality for each element.
So we would execute the function at the .agg()
stage:
(df.group_by("test_id")
.agg(
data = pl.struct("events", "fails", "recipe"),
glm = pl.struct("events", "fails", "recipe").map_batches(lambda x: pl.Series(
smf.glm(
formula = "events + fails ~ recipe",
family = sm.families.Binomial(),
data = x.struct.unnest().to_pandas()).fit().pvalues
))
)
)
shape: (3, 3)
┌─────────┬────────────────────────────────┬────────────────────────┐
│ test_id ┆ data ┆ glm │
│ --- ┆ --- ┆ --- │
│ i64 ┆ list[struct[3]] ┆ list[f64] │
╞═════════╪════════════════════════════════╪════════════════════════╡
│ 1 ┆ [{655,345,"A"}, {674,326,"B"}] ┆ [5.5700e-22, 0.368276] │
│ 2 ┆ [{657,343,"A"}, {658,342,"B"}] ┆ [1.7233e-22, 0.962417] │
│ 0 ┆ [{630,370,"A"}, {657,343,"B"}] ┆ [4.4486e-16, 0.207569] │
└─────────┴────────────────────────────────┴────────────────────────┘
Polars doesn"t recognize the return value of glm()
but wrapping it with pl.Series()
inside the callback gives us a list[f64]
.
If you don't actually want to aggregate the DataFrame, you can use .over()
instead.
df.with_columns(
pl.struct("events", "fails", "recipe").map_batches(lambda x: pl.Series(
smf.glm(
formula = "events + fails ~ recipe",
family = sm.families.Binomial(),
data = x.struct.unnest().to_pandas()).fit().pvalues
))
.flatten()
.over("test_id")
.alias("glm")
)
shape: (6, 6)
┌────────┬────────┬────────┬─────────┬───────┬────────────┐
│ recipe ┆ trials ┆ events ┆ test_id ┆ fails ┆ glm │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i32 ┆ i64 ┆ i64 ┆ i64 ┆ f64 │
╞════════╪════════╪════════╪═════════╪═══════╪════════════╡
│ A ┆ 1000 ┆ 630 ┆ 0 ┆ 370 ┆ 4.4486e-16 │
│ A ┆ 1000 ┆ 655 ┆ 1 ┆ 345 ┆ 5.5700e-22 │
│ A ┆ 1000 ┆ 657 ┆ 2 ┆ 343 ┆ 1.7233e-22 │
│ B ┆ 1000 ┆ 657 ┆ 0 ┆ 343 ┆ 0.207569 │
│ B ┆ 1000 ┆ 674 ┆ 1 ┆ 326 ┆ 0.368276 │
│ B ┆ 1000 ┆ 658 ┆ 2 ┆ 342 ┆ 0.962417 │
└────────┴────────┴────────┴─────────┴───────┴────────────┘