Consider a dataframe:
animal animal2 size count
dog dog small 2
dog cat large 3
cat dog small 1
dog pig large 5
cat cat large 3
pig dog small 9
pig cat large 2
cat pig large 3
I want to build a 3 column Sankey plot that shows the flows between the categories (every line in the Sankey would be the number of times a pair of elements, each from one column, appear together).
This code I found seems to work, but it has a lot of self-loops, as I have similar categories in multiple columns:
def genSankey(df,cat_cols=[],value_cols='',title='Sankey Diagram'):
# maximum of 6 value cols -> 6 colors
colorPalette = ['#4B8BBE','#306998','#FFE873','#FFD43B','#646464']
labelList = []
colorNumList = []
for catCol in cat_cols:
labelListTemp = list(set(df[catCol].values))
colorNumList.append(len(labelListTemp))
labelList = labelList + labelListTemp
# remove duplicates from labelList
labelList = list(dict.fromkeys(labelList))
# define colors based on number of levels
colorList = []
for idx, colorNum in enumerate(colorNumList):
colorList = colorList + [colorPalette[idx]]*colorNum
# transform df into a source-target pair
for i in range(len(cat_cols)-1):
if i==0:
sourceTargetDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
sourceTargetDf.columns = ['source','target','count']
else:
tempDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
tempDf.columns = ['source','target','count']
sourceTargetDf = pd.concat([sourceTargetDf,tempDf])
sourceTargetDf = sourceTargetDf.groupby(['source','target']).agg({'count':'sum'}).reset_index()
# add index for source-target pair
sourceTargetDf['sourceID'] = sourceTargetDf['source'].apply(lambda x: labelList.index(x))
sourceTargetDf['targetID'] = sourceTargetDf['target'].apply(lambda x: labelList.index(x))
# creating the sankey diagram
data = dict(
type='sankey',
node = dict(
pad = 15,
thickness = 20,
line = dict(
color = "black",
width = 1.5 # was 0.5
),
label = labelList,
color = colorList
),
link = dict(
source = sourceTargetDf['sourceID'],
target = sourceTargetDf['targetID'],
value = sourceTargetDf['count']
)
)
layout = dict(
title = title,
font = dict(
size = 20 # was 10
)
)
fig = dict(data=[data], layout=layout)
return fig
which can be run as:
import pandas as pd
import plotly
import chart_studio.plotly as py
fig = genSankey(df,cat_cols=['animal1','animal2','size'],value_cols='count',title='Animal List')
plotly.offline.plot(fig, validate=False)
Is there something I can simply change in this function to stop getting the self-loops?
'dog ' and 'dog' Adding an extra space should do the trick. Give it a shot.