python nodes plotly-python sankey-diagram cartesian-coordinates

Plotly.py Sankey Diagrams - Controlling Node Destination


I have a similar issue to a question previously posted:

Plotly: How to set node positions in a Sankey Diagram?

As in that question, I need all node labels that end in the same character to line up in the same vertical column of my Sankey diagram (there are three vertical columns in total, and I want (A) in the first, (B) in the second, and (C) in the third). An answer to that earlier post provided a custom function that assigns nodes ending in the same character to the same destination, which I have modified to fit my dataset, as below:

# Extract the list of nodes and the Source / Target links from the my_df DataFrame.
#
# Each label is kept exactly once, in order of first appearance
# (dict.fromkeys preserves insertion order). Concatenating the raw Source and
# Target columns would repeat a label once per row; since only the first
# occurrence is ever referenced by a link index, the remaining copies would be
# dead entries in the label list and in any x/y position lists built from it.
all_nodes = list(
    dict.fromkeys(my_df.Source.values.tolist() + my_df.Target.values.tolist())
)
values = my_df.Value.values.tolist()

# Build the label -> position lookup once instead of scanning all_nodes with
# list.index() for every row.
node_index = {label: position for position, label in enumerate(all_nodes)}
source_indices = [node_index[source] for source in my_df.Source]
target_indices = [node_index[target] for target in my_df.Target]

# Node labels followed by the link values, printed for inspection.
label_names = all_nodes + values
print(label_names)

# Function to assign identical x-positions to label names that have a common ending ((A),(B),(C))

def nodify(node_names):
    """Compute x/y positions for Sankey nodes, one x value per name ending.

    The second-to-last character of every name (the letter inside the
    trailing parentheses, e.g. ``A`` in ``"1(A)"``) identifies its group.
    Groups are sorted and given x values 0.5, 1.0, 1.5, ... in that order.
    y values start at 0 and grow by 0.001 per node, in input order.

    Args:
        node_names: Sequence of node labels, each at least two characters
            long. The positions are computed from this argument; no global
            state is consulted.

    Returns:
        A tuple ``(x_values, y_values)`` of two lists, each aligned with
        ``node_names``.

    Raises:
        IndexError: If a name is shorter than two characters.
    """
    # NOTE(review): with three endings this yields x = 0.5, 1.0 and 1.5, and
    # the first y is exactly 0. Plotly documents node.x / node.y as normalized
    # positions, so values outside the unit interval (or equal to 0) may not
    # be honoured -- confirm against the Sankey reference before relying on it.
    start = 0.5
    step = 0.5

    # Unique name endings, sorted so the column order is deterministic.
    ends = sorted({name[-2] for name in node_names})

    # x value for each unique ending.
    nodes_x = {end: start + position * step for position, end in enumerate(ends)}

    # x and y values in list form, aligned with node_names.
    x_values = [nodes_x[name[-2]] for name in node_names]
    y_values = []
    y_val = 0
    for _ in node_names:
        y_values.append(y_val)
        y_val += 0.001
    return x_values, y_values

nodified = nodify(node_names=all_nodes)

# Plot the Sankey diagram from my_df, passing the computed x/y node positions

# Node appearance and placement
node_settings = {
    "pad": 8,
    "thickness": 10,
    "line": {"color": "black", "width": 0.5},
    "label": all_nodes,
    "color": "blue",
    "x": nodified[0],
    "y": nodified[1],
}

# Links between nodes, given as indices into the label list
link_settings = {
    "source": source_indices,
    "target": target_indices,
    "value": my_df.Value,
}

sankey_trace = go.Sankey(
    arrangement="snap",
    node=node_settings,
    link=link_settings,
)
fig = go.Figure(data=[sankey_trace])

fig.update_layout(
    title_text="My Title",
    font_size=10,
    autosize=True,
    height=2000,
    width=2000,
)
fig.show()

The destination assignment was not working for me at all until I found an open GitHub issue (#3002) indicating that Plotly does not like x and y coordinates set to 0. I therefore changed `xVal` to start at 0.5 rather than 0, which snapped the node destinations mostly into place, with the exception of four (B) values that still end up in the (C) column.

Is there anything I'm missing about the Plotly coordinate system or node destination in general that could help me understand why Plotly is continually overriding my node destination assignment for a handful of the total nodes?

Sample DataFrame:

0   1(A)    11(B)   6
1   1(A)    12(B)   2
2   1(A)    13(B)   20
3   1(A)    14(B)   1
4   1(A)    15(B)   1
5   1(A)    2(B)    17
6   1(A)    16(B)   5
7   1(A)    17(B)   9
8   1(A)    18(B)   6
9   1(A)    19(B)   5
10  1(A)    20(B)   255
11  1(A)    21(B)   1
12  1(A)    22(B)   9
13  1(A)    3(B)    200
14  1(A)    23(B)   1
15  1(A)    4(B)    1035
16  1(A)    24(B)   14
17  1(A)    25(B)   20
18  1(A)    26(B)   2
19  1(A)    27(B)   222
20  1(A)    28(B)   8
21  1(A)    29(B)   44
22  1(A)    5(B)    3
23  1(A)    6(B)    1529
24  1(A)    30(B)   1
25  1(A)    31(B)   2
26  1(A)    7(B)    6
27  1(A)    32(B)   1
28  1(A)    8(B)    10
29  1(A)    33(B)   11
30  1(A)    34(B)   35
31  1(A)    35(B)   1
32  1(A)    36(B)   41
33  1(A)    37(B)   6
34  1(A)    38(B)   4
35  1(A)    39(B)   2
36  1(A)    40(B)   68
37  1(A)    41(B)   46
38  1(A)    42(B)   24
39  1(A)    9(B)    21
40  1(A)    10(B)   13
41  1(A)    43(B)   6
42  2(B)    44(C)   12
43  3(B)    45(C)   19
44  4(B)    46(C)   1
45  5(B)    47(C)   6
46  6(B)    46(C)   2
47  6(B)    48(C)   1
48  6(B)    49(C)   1
49  7(B)    50(C)   84
50  8(B)    51(C)   2
51  9(B)    46(C)   4
52  10(B)   52(C)   2
53  10(B)   52(C)   2
54  10(B)   53(C)   8
55  10(B)   53(C)   8
56  10(B)   53(C)   12
57  10(B)   53(C)   20
58  10(B)   53(C)   10
59  10(B)   53(C)   4

Any help is appreciated!


Solution

  • import pandas as pd
    import numpy as np
    import plotly.graph_objects as go
    import itertools
    
    # Number of random links to draw before filtering and aggregation.
    S = 40
    # Candidate node labels: a number 1-5 followed by a column letter A, B or C
    # ("1A", "2A", ..., "5C").
    labels = [str(p + 1) + s for s, p in itertools.product(list("ABC"), range(5))]
    # Random sample links. No seed is set, so the data differs on every run.
    df = pd.DataFrame(
        {
            "source": np.random.choice(labels, S),
            "target": np.random.choice(labels, S),
            "value": np.random.randint(1, 10, S),
        }
    )
    # make sure paths are valid...
    # Keep only links whose source letter sorts strictly before the target
    # letter, i.e. links that flow from an earlier column to a later one.
    df = df.loc[df["source"].str[-1].apply(ord) < df["target"].str[-1].apply(ord)]
    # Collapse repeated (source, target) pairs into one row with summed values.
    df = df.groupby(["source", "target"], as_index=False).sum()
    
    
    def factorize(s):
        """Turn the values of ``s`` into fractional positions.

        Each value is replaced by its rank among the sorted unique values
        (via ``pd.factorize(..., sort=True)``); rank ``k`` is then mapped to
        ``(k + 0.01) / (highest_rank + 0.1)``, so the lowest rank is slightly
        above 0 and the highest slightly below 1.
        """
        codes, _uniques = pd.factorize(s, sort=True)
        highest_code = max(codes)
        return (codes + 0.01) / (highest_code + 0.1)
    
    
    # Sorted, de-duplicated labels that appear as either end of a link
    nodes = np.unique(df[["source", "target"]], axis=None)
    # Lookup from label to integer id (ids follow the sorted label order)
    nodes = pd.Series(range(len(nodes)), index=nodes)
    # Position the nodes: x from the trailing letter, y from the leading part
    nodes = nodes.to_frame("id")
    nodes["x"] = factorize(nodes.index.str[-1])
    nodes["y"] = factorize(nodes.index.str[:-1])

    # Assemble the Sankey trace from separate node and link specifications
    node_spec = dict(label=nodes.index, x=nodes["x"], y=nodes["y"])
    link_spec = dict(
        source=nodes.loc[df["source"], "id"],
        target=nodes.loc[df["target"], "id"],
        value=df["value"],
    )
    fig = go.Figure(
        go.Sankey(arrangement="snap", node=node_spec, link=link_spec)
    )

    fig
    

    enter image description here

    generated data

    source target value
    1A 3C 7
    1B 1C 5
    1B 3C 6
    2A 4B 12
    2B 2C 8
    3A 3C 1
    3B 1C 8
    3B 3C 10
    4A 1B 5
    4B 2C 9
    4B 3C 8
    4B 4C 3
    5A 1B 1
    5A 2C 9
    5A 5B 4

    using sample data now in question

    # Position the nodes for labels such as "11(B)": the last three characters
    # ("(B)") choose x, everything before them chooses y
    nodes = nodes.to_frame("id")
    nodes["x"] = factorize(nodes.index.str[-3:])
    nodes["y"] = factorize(nodes.index.str[:-3])