python, pandas, dataframe

Finding the count of unique sequences made up of values spanning several columns in Python


DataFrame A contains columns x, y1, y2, y3 & y4. I am interested in studying the {y1,y2,y3,y4} sequences w.r.t. x. To find the unique {y1,y2,y3,y4} sequences occurring for each x, I do the following:

import pandas as pd

B = pd.DataFrame()
for x_temp in A['x'].unique():
    # collect the rows of A belonging to each x, then drop repeated combinations
    B = pd.concat([B, A[A['x'] == x_temp][['x','y1','y2','y3','y4']]])
B = B.drop_duplicates().sort_values(by=['x','y1','y2','y3','y4'])
del x_temp
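
For what it's worth, the loop just re-assembles A group by group, so B can equivalently be built in one step:

B = (A[['x','y1','y2','y3','y4']]
     .drop_duplicates()
     .sort_values(by=['x','y1','y2','y3','y4']))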

I want to introduce a new column called 'count' in B, which contains the number of occurrences in A of that row's {y1,y2,y3,y4} combination.

# for every row, count the rows of A with an identical {y1,y2,y3,y4} combination;
# index alignment then maps these per-row counts onto B
B['count'] = A.apply(lambda row: (A['y1'] == row['y1']) & (A['y2'] == row['y2']) &
                                 (A['y3'] == row['y3']) & (A['y4'] == row['y4']), axis=1).sum()

This works; however, it breaks down when A or B contains missing values. I want missing values to be treated as just another unique value.
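
I believe this is because NaN never compares equal to anything, not even itself, so the == comparisons silently exclude every row with a missing value:

import numpy as np

print(np.nan == np.nan)  # False: NaN != NaN, so (A['y1'] == row['y1'])
                         # is False whenever either side is missing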

Example:

import numpy as np
import pandas as pd

A = pd.DataFrame({'x':  ['1','1','1','1','2','2','2','2','2','1'],
                  'y1': ['1','2','2',np.nan,'2',np.nan,'2','2','2','1'],
                  'y2': ['2','1','2','2',np.nan,np.nan,'2','2','1','2'],
                  'y3': ['1','1',np.nan,'2',np.nan,'2',np.nan,np.nan,'1','1'],
                  'y4': ['2','2','2',np.nan,np.nan,'1','2','2','2','2']})

B = pd.DataFrame()
for x_temp in A['x'].unique():
    B = pd.concat([B, A[A['x'] == x_temp][['x','y1','y2','y3','y4']]])
B = B.drop_duplicates().sort_values(by=['x','y1','y2','y3','y4'])
del x_temp

B['count'] = A.apply(lambda row: (A['y1'] == row['y1']) & (A['y2'] == row['y2']) &
                                 (A['y3'] == row['y3']) & (A['y4'] == row['y4']), axis=1).sum()
print(B)

   x   y1   y2   y3   y4  count
0  1    1    2    1    2      2
1  1    2    1    1    2      2
2  1    2    2  NaN    2      0
3  1  NaN    2    2  NaN      0
8  2    2    1    1    2      2
6  2    2    2  NaN    2      0
4  2    2  NaN  NaN  NaN      0
5  2  NaN  NaN    2    1      0

Solution

  • Assuming you want to count each full combination and then sum those counts across the different xs, you could use:

    cols = ['x','y1','y2','y3','y4']
    
    out = (A[cols].value_counts(dropna=False, sort=False)
           .reset_index(name='count')
           .sort_values(by=cols, na_position='last')
           .assign(count=lambda x: x.groupby(cols[1:], dropna=False)
                                   ['count'].transform('sum')
                  )
          )
    

    Output:

       x   y1   y2   y3   y4  count
    0  1    1    2    1    2      2
    1  1    2    1    1    2      2
    2  1    2    2  NaN    2      3
    3  1  NaN    2    2  NaN      1
    4  2    2    1    1    2      2
    5  2    2    2  NaN    2      3
    6  2    2  NaN  NaN  NaN      1
    7  2  NaN  NaN    2    1      1
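
    If you'd rather keep the B you built above and just attach the counts to it, the per-combination counts can also be merged back (counts below is just an illustrative intermediate name). This relies on merge, unlike ==, matching NaN join keys with each other; a minimal sketch:

    cols = ['x','y1','y2','y3','y4']

    # occurrence count of every (x, y1..y4) combination, with NaN kept as a distinct value
    counts = A[cols].value_counts(dropna=False, sort=False).reset_index(name='count')
    # merge matches NaN keys with NaN keys, so rows with missing values get their counts too
    B = B.merge(counts, on=cols, how='left')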
    

    If you instead want the rows containing NaNs to get a count of 0 (as in your output above), leave out dropna=False from the groupby: the NaN-containing keys are then dropped, transform('sum') returns NaN for those rows, and fillna(0) converts them to zero:

    cols = ['x','y1','y2','y3','y4']
    
    out = (A[cols].value_counts(dropna=False, sort=False)
           .reset_index(name='count')
           .sort_values(by=cols, na_position='last')
           .assign(count=lambda x: x.groupby(cols[1:])
                                   ['count'].transform('sum')
                                   .fillna(0).convert_dtypes()
                  )
          )
    

    Output:

       x   y1   y2   y3   y4  count
    0  1    1    2    1    2      2
    1  1    2    1    1    2      2
    2  1    2    2  NaN    2      0
    3  1  NaN    2    2  NaN      0
    4  2    2    1    1    2      2
    5  2    2    2  NaN    2      0
    6  2    2  NaN  NaN  NaN      0
    7  2  NaN  NaN    2    1      0
    

    If you don't want to sum across the xs, simply skip the groupby step:

    cols = ['x','y1','y2','y3','y4']
    
    out = (A[cols].value_counts(dropna=False, sort=False)
           .reset_index(name='count')
           .sort_values(by=cols, na_position='last')
          )
    

    Output:

       x   y1   y2   y3   y4  count
    0  1    1    2    1    2      2
    1  1    2    1    1    2      1
    2  1    2    2  NaN    2      1
    3  1  NaN    2    2  NaN      1
    4  2    2    1    1    2      1
    5  2    2    2  NaN    2      2
    6  2    2  NaN  NaN  NaN      1
    7  2  NaN  NaN    2    1      1
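
    For completeness: value_counts is essentially a size aggregation, so the same per-combination counts (NaN groups included) can be reproduced with groupby; a minimal equivalent sketch:

    out = (A.groupby(cols, dropna=False)   # dropna=False keeps NaN as its own group key
           .size()
           .reset_index(name='count'))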