pythonpandasdataframegroup-byaggregate

Pandas: Returning Blank Array after Groupby Aggregate


I have a dataframe:

data = {
  "Key": ["A1", "A2", np.nan, "A3", "A4"],
  "Name": ["Candy A", "Candy B", np.nan, "Candy C", "Candy D"],
  "Amout": [25, 50, np.nan, np.nan, 50],
  "Condition": ["Good", "Good", "Good", "Good", "Good"],
  "Packing": ["25 Nice", "49 Nice", "1 Damaged", "40 Nice", "50 Nice"],
  "Sunlight" : [np.nan, np.nan, np.nan, np.nan, "No Sunlight"]
}

df = pd.DataFrame(data)

print(df) 
   Key     Name  Amout Condition    Packing     Sunlight
0   A1  Candy A   25.0      Good    25 Nice          NaN
1   A2  Candy B   50.0      Good    49 Nice          NaN
2  NaN      NaN    NaN      Good  1 Damaged          NaN
3   A3  Candy C    NaN      Good    40 Nice          NaN
4   A4  Candy D   50.0      Good    50 Nice  No Sunlight

I tried to change the dataframe so that the dataframe more neatly.

def custom_agg(s):
                if pd.api.types.is_numeric_dtype(s):
                    return s.sum(min_count=1)
                s = s.dropna().drop_duplicates()
                if len(s) > 1:
                    return ', '.join(s.astype(str))
                return s

df = df.groupby(df['Key'].notna().cumsum(), as_index=False).agg(custom_agg)

print(df) 

In the column Sunlight, most of it contains blank array

Key     Name  Amout Condition             Packing     Sunlight
0  A1  Candy A   25.0      Good             25 Nice           []
1  A2  Candy B   50.0      Good  49 Nice, 1 Damaged           []
2  A3  Candy C    NaN      Good             40 Nice           []
3  A4  Candy D   50.0      Good             50 Nice  No Sunlight
{'index': [0, 1, 2, 3], 'columns': ['Key', 'Name', 'Amout', 'Condition', 'Packing', 'Sunlight'], 'data': [['A1', 'Candy A', 25.0, 'Good', '25 Nice', array([], dtype=object)], ['A2', 'Candy B', 50.0, 'Good', '49 Nice, 1 Damaged', array([], dtype=object)], ['A3', 'Candy C', nan, 'Good', '40 Nice', array([], dtype=object)], ['A4', 'Candy D', 50.0, 'Good', '50 Nice', 'No Sunlight']], 'index_names': [None], 'column_names': [None]}

I want the output to be NaN, no blank Array. I tried replace and mask but it didn't work. Any ideas?


Solution

  • I guess you can add a check in your custom_agg function that if s is empty series then return np.nan instead

    s = s.dropna().drop_duplicates()
    if len(s) == 0:
      return np.nan
    

    So you would get:

    import pandas as pd
    import numpy as np
    
    data = {
      "Key": ["A1", "A2", np.nan, "A3", "A4"],
      "Name": ["Candy A", "Candy B", np.nan, "Candy C", "Candy D"],
      "Amout": [25, 50, np.nan, np.nan, 50],
      "Condition": ["Good", "Good", "Good", "Good", "Good"],
      "Packing": ["25 Nice", "49 Nice", "1 Damaged", "40 Nice", "50 Nice"],
      "Sunlight" : [np.nan, np.nan, np.nan, np.nan, "No Sunlight"]
    }
    
    df = pd.DataFrame(data)
    
    print(df) 
    
    def custom_agg(s):
        if pd.api.types.is_numeric_dtype(s):
            return s.sum(min_count=1)
        s = s.dropna().drop_duplicates()
        if len(s) > 1:
            return ', '.join(s.astype(str))
        elif len(s) == 0:
            return np.nan
        else:
            return s
    
    df = df.groupby(df['Key'].notna().cumsum(), as_index=False).agg(custom_agg)
    
    print(df)
    

    The final print would output

      Key     Name  Amout Condition             Packing     Sunlight
    0  A1  Candy A   25.0      Good             25 Nice          NaN
    1  A2  Candy B   50.0      Good  49 Nice, 1 Damaged          NaN
    2  A3  Candy C    NaN      Good             40 Nice          NaN
    3  A4  Candy D   50.0      Good             50 Nice  No Sunlight