I'm subclassing the pandas `DataFrame` in a project of mine. Most pandas
operations preserve the subclass type, but `df.groupby().agg()`
does not. Is this a bug? Is there a known workaround?
import pandas as pd
class MySeries(pd.Series):
    """Series subclass produced when a MyDataFrame is sliced to one dimension."""


class MyDataFrame(pd.DataFrame):
    """DataFrame subclass that asks pandas to keep the subclass in results."""

    # Class pandas should use for one-dimensional results (e.g. one column).
    _constructor_sliced = MySeries

    @property
    def _constructor(self):
        """Return the class pandas should use for same-dimensional results."""
        return MyDataFrame


# Wired up after both classes exist: expanding a MySeries to two dimensions
# should produce a MyDataFrame.
MySeries._constructor_expanddim = MyDataFrame
# Minimal reproduction: build a subclassed frame with a grouping column "b".
df = MyDataFrame({"a": reversed(range(10)), "b": list('aaaabbbccc')})
# Calling the reduction directly on the groupby keeps the subclass type.
print(type(df.groupby("b").sum()))
# <class '__main__.MyDataFrame'>
# The same reduction requested through agg() comes back as a plain DataFrame.
print(type(df.groupby("b").agg({"a": "sum"})))
# <class 'pandas.core.frame.DataFrame'>
It looks like there was an issue (described here) that fixed subclassing for df.groupby, but as far as I can tell df.groupby().agg() was missed. I'm using pandas version 2.0.3.
The workaround I'm currently using is to re-initialize the subclassed DataFrame
and call the `__finalize__` method, which propagates metadata to the new object.
# Workaround: wrap the plain DataFrame that agg() returns in MyDataFrame again,
# then have __finalize__ copy the metadata over from the original my_df.
MyDataFrame(my_df.groupby("b").agg({"a": "sum"})).__finalize__(other=my_df)
First, I've added a custom attribute, `my_attr`, to `MyDataFrame`:
import pandas as pd
class MySeries(pd.Series):
    """Series subclass that declares the custom ``my_attr`` metadata field."""

    # Names listed here are the metadata that ``__finalize__`` propagates.
    _metadata = ['my_attr']


class MyDataFrame(pd.DataFrame):
    """DataFrame subclass carrying one extra piece of metadata, ``my_attr``."""

    # Names listed here are the metadata that ``__finalize__`` propagates.
    _metadata = ['my_attr']

    # Class pandas should use for one-dimensional results (e.g. one column).
    _constructor_sliced = MySeries

    def __init__(
        self,
        data=None,
        my_attr=None,
        index=None,
        columns=None,
        dtype=None,
        copy=None,
    ):
        """Build the frame and attach the custom attribute.

        Args:
            data: Anything ``pd.DataFrame`` accepts. Defaults to ``None`` so
                that ``MyDataFrame()`` builds an empty frame, just like
                ``pd.DataFrame()``.
            my_attr: Custom metadata value stored on the instance.
            index: Forwarded to ``pd.DataFrame``.
            columns: Forwarded to ``pd.DataFrame``.
            dtype: Forwarded to ``pd.DataFrame``.
            copy: Forwarded to ``pd.DataFrame``.
        """
        self.my_attr = my_attr
        # Forward by keyword so the call does not depend on the positional
        # order of the parent constructor's parameters.
        super().__init__(
            data=data,
            index=index,
            columns=columns,
            dtype=dtype,
            copy=copy,
        )

    @property
    def _constructor(self):
        """Return the class pandas should use for same-dimensional results."""
        return MyDataFrame


# Wired up after both classes exist: expanding a MySeries to two dimensions
# should produce a MyDataFrame.
MySeries._constructor_expanddim = MyDataFrame
Now we can check that subclass type and custom attributes are preserved:
# Build a subclassed frame that also carries the custom metadata value.
my_df = MyDataFrame(
{"a": reversed(range(10)), "b": list('aaaabbbccc')},
my_attr='foo'
)
# Construction itself yields the subclass.
assert isinstance(my_df, MyDataFrame)
# Success!
# Row sampling keeps the subclass.
assert isinstance(my_df.sample(3), MyDataFrame)
# Success!
# Copying keeps the subclass.
assert isinstance(my_df.copy(), MyDataFrame)
# Success!
# Calling the reduction directly on the groupby keeps the subclass.
new_df = my_df.groupby("b").sum()
assert isinstance(new_df, MyDataFrame)
# Success! - fixed by issue linked in question
# Requesting the same reduction through agg() loses the subclass ...
new_df = my_df.groupby("b").agg({"a": "sum"})
assert isinstance(new_df, MyDataFrame)
# AssertionError
# ... and the custom attribute along with it.
assert new_df.my_attr == 'foo'
# AttributeError
# Workaround: take the plain DataFrame returned by agg() ...
new_df = my_df.groupby("b").agg({"a": "sum"})
# ... re-wrap it in the subclass and copy the metadata over from my_df.
new_df = MyDataFrame(new_df).__finalize__(other=my_df)
# The subclass type is restored.
assert isinstance(new_df, MyDataFrame)
# Success!
# The custom attribute is restored as well.
assert new_df.my_attr == 'foo'
# Success!