pythonpandasdataframecrosstab

Count combinations of values using pandas crosstab


I would like to count the combinations of values (pets) per group (user). The code below gives the desired result. However, I'm looking for a more 'pandamic' way, maybe by using the crosstab method. Any suggestions for a less verbose solution?

import pandas as pd
import numpy as np
import itertools
df1 = pd.DataFrame({'user':['Jane', 'Matthew', 'Emily'], 'pets':[['dog', 'cat', 'lizard'], ['dog', 'spider'], ['dog', 'cat', 'monkey']]}).explode('pets')
combinations = []
for g in df1.groupby('user'): combinations += [x for x in itertools.combinations(g[1].pets, 2)]
df2 = pd.DataFrame(np.zeros((df1.pets.nunique(), df1.pets.nunique()), dtype=int), columns=df1.pets.unique(), index=df1.pets.unique())
for x in combinations:
    df2.at[x[0], x[1]] += 1
    df2.at[x[1], x[0]] += 1
print(df2)

Result:

        dog  cat  lizard  spider  monkey
dog       0    2       1       1       1
cat       2    0       1       0       1
lizard    1    1       0       0       0
spider    1    0       0       0       0
monkey    1    1       0       0       0

Solution

  • Use DataFrame.merge with crosstab:

    df = df1.merge(df1, on='user')
    df = pd.crosstab(df.pets_x, df.pets_y).rename_axis(index=None, columns=None)
    print(df)
    
            cat  dog  lizard  monkey  spider
    cat       2    2       1       1       0
    dog       2    3       1       1       1
    lizard    1    1       1       0       0
    monkey    1    1       0       1       0
    spider    0    1       0       0       1
    

    If need set values in diagonal to 0 add numpy.fill_diagonal:

    df = df1.merge(df1, on='user')
    
    df = pd.crosstab(df.pets_x, df.pets_y).rename_axis(index=None, columns=None)
    np.fill_diagonal(df.to_numpy(), 0)
    print (df)
            cat  dog  lizard  monkey  spider
    cat       0    2       1       1       0
    dog       2    0       1       1       1
    lizard    1    1       0       0       0
    monkey    1    1       0       0       0
    spider    0    1       0       0       0
    
    import itertools
    combinations = []
    for g in df1.groupby('user'): combinations += [x for x in itertools.combinations(g[1].pets, 2)]
    df2 = pd.DataFrame(np.zeros((df1.pets.nunique(), df1.pets.nunique()), dtype=int), columns=df1.pets.unique(), index=df1.pets.unique())
    for x in combinations:
        df2.at[x[0], x[1]] += 1
        df2.at[x[1], x[0]] += 1
    print(df2)
            dog  cat  lizard  spider  monkey
    dog       0    2       1       1       1
    cat       2    0       1       0       1
    lizard    1    1       0       0       0
    spider    1    0       0       0       0
    monkey    1    1       0       0       0