rggplot2network-programmingsankey-diagramggalluvial

Create Flow chart in R


I am trying to make either a multi node Sankey or Alluvial plot whichever is more appropriate.

The output would be similar to this which is from the ggalluvial packgage vignette here gg alluvial vignette

enter image description here

The difference would be that my time_period would be on the x axis and my source column would replace the survey responses. I also tried the Sankey plot from the networkD3 package for a single time period to get results similar to below from the vignette here Sankey Plot

enter image description here

Which would be a good compromise if I cant visualize all the time periods but it did not work either. My sample data and code is below. Thanks

Data

  dat = structure(list(time_period = c("1 -> 2", "1 -> 2", "1 -> 2", 
"1 -> 2", "1 -> 2", "1 -> 2", "1 -> 2", "1 -> 2", "1 -> 2", "1 -> 2", 
"1 -> 2", "1 -> 2", "1 -> 2", "1 -> 2", "1 -> 2", "1 -> 2", "2 -> 3", 
"2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3", 
"2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3", 
"2 -> 3", "3 -> 4", "3 -> 4", "3 -> 4", "3 -> 4", "3 -> 4", "3 -> 4", 
"3 -> 4", "3 -> 4", "3 -> 4", "3 -> 4", "3 -> 4", "3 -> 4", "3 -> 4", 
"3 -> 4", "3 -> 4", "3 -> 4", "4 -> 5", "4 -> 5", "4 -> 5", "4 -> 5", 
"4 -> 5", "4 -> 5", "4 -> 5", "4 -> 5", "4 -> 5", "4 -> 5", "4 -> 5", 
"4 -> 5", "4 -> 5", "4 -> 5", "4 -> 5", "4 -> 5"), source = c("A", 
"A", "A", "A", "B", "B", "B", "B", "C", "C", "C", "C", "D", "D", 
"D", "D", "A", "A", "A", "A", "B", "B", "B", "B", "C", "C", "C", 
"C", "D", "D", "D", "D", "A", "A", "A", "A", "B", "B", "B", "B", 
"C", "C", "C", "C", "D", "D", "D", "D", "A", "A", "A", "A", "B", 
"B", "B", "B", "C", "C", "C", "C", "D", "D", "D", "D"), target = c("A", 
"B", "C", "D", "A", "B", "C", "D", "A", "B", "C", "D", "A", "B", 
"C", "D", "A", "B", "C", "D", "A", "B", "C", "D", "A", "B", "C", 
"D", "A", "B", "C", "D", "A", "B", "C", "D", "A", "B", "C", "D", 
"A", "B", "C", "D", "A", "B", "C", "D", "A", "B", "C", "D", "A", 
"B", "C", "D", "A", "B", "C", "D", "A", "B", "C", "D"), count = c(200573, 
27490, 869, 11330, 22136, 208721, 243, 921, 1552, 266, 97647, 
489, 9644, 743, 491, 62900, 179754, 23188, 1111, 9760, 27824, 
193337, 228, 769, 858, 159, 83213, 330, 10410, 869, 474, 54946, 
188765, 30850, 973, 9485, 22181, 196101, 218, 1012, 1482, 292, 
91553, 392, 9989, 724, 431, 50766, 201313, 25308, 1095, 10801, 
25842, 206138, 246, 836, 1199, 210, 94152, 362, 8414, 624, 457, 
55365)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-64L))

Code:

library(networkD3)
library(tidyverse)

  
 dat  = dat %>% filter(time_period == '4 -> 5') 
 
 
 nodes <- data.frame(name=c(as.character(dat$source), as.character(dat$target)) %>% unique())
  
 
 dat$IDsource=match(dat$source, nodes$name)-1 
 dat$IDtarget=match(dat$target, nodes$name)-1
  
      
  # Make the Network
  sankeyNetwork(Links = dat, Nodes = nodes,
                Source = "IDsource", Target = "IDtarget",
                Value = "count", NodeID = "name", 
                sinksRight=FALSE, nodeWidth=40, fontSize=13, nodePadding=20)

Solution

  • I gave it a go using ggalluvial. Each section is plotted quite independently. The height difference between borders can be fixed by adding empty rows.

    library(ggalluvial)
    library(dplyr)
    
    head(dat)
    # time_period source target  count
    #  <chr>       <chr>  <chr>   <dbl>
    #1 1 -> 2      A      A      200573
    #2 1 -> 2      A      B       27490
    #3 1 -> 2      A      C         869
    #4 1 -> 2      A      D       11330
    #5 1 -> 2      B      A       22136
    #6 1 -> 2      B      B      208721
    
    # re-format into long data format
    dat2 = NULL
    for (i in 1:4) {
      tem <- dat %>%
        filter(time_period == paste0(i," -> ", i+1)) %>%
        {data.frame(
          alluvium = paste(i, c(seq(nrow(.)), 
                                seq(nrow(.)))), # same alluvium for each pair
          time = c(rep(i+ifelse(i>1,0.0001,0), nrow(.)), 
                   rep(i+1, nrow(.))), # +0.0001 to avoid overlapping with previous section
          count = c(.$count, .$count),
          group = c(.$source, .$target)
        )
        }
      dat2 <- rbind(dat2, tem)
    }
    
    # padding missing values to match borders between sections
    for (i in 1:4) {
      # right side missing
      if (i < 4) {
        count_this <- dat %>%
          filter(time_period == paste0(i," -> ", i+1)) %>%
          group_by(target) %>%
          summarise(count = sum(count)) %>%
          .$count
        
        count_right <- dat %>%
          filter(time_period == paste0(i+1," -> ", i+2)) %>%
          group_by(source) %>%
          summarise(count = sum(count)) %>%
          .$count
        
        count_max <- pmax(count_this, count_right)
        
        dat2 = dat2 %>%
          rbind(., data.frame(
            alluvium = paste(i, "right_missing", seq(4)),
            time = c(rep(i+1, 4)),
            count = count_max - count_this, # padding if less count than the right side
            group = c("A", "B", "C", "D")
          )
          )
      }
    
      # left side missing
      if (i > 1) {
        count_left <- dat %>%
          filter(time_period == paste0(i-1," -> ", i)) %>%
          group_by(target) %>%
          summarise(count = sum(count)) %>%
          .$count
        
        count_this <- dat %>%
          filter(time_period == paste0(i," -> ", i+1)) %>%
          group_by(source) %>%
          summarise(count = sum(count)) %>%
          .$count
        
        count_max <- pmax(count_left, count_this)
        
        dat2 = dat2 %>%
          rbind(., data.frame(
            alluvium = paste(i, "left_missing", seq(4)),
            time = c(rep(i+0.0001, 4)),
            count = count_max - count_this,  # padding if less count than the left side
            group = c("A", "B", "C", "D")
          )
          )
      }
    }
    
    dat2 %>%
      filter(count>0) %>%
      ggplot(aes(x = time, y = count, stratum = group, alluvium = alluvium)) +
      geom_flow(aes(fill = group)) +
      geom_stratum(data = dat2[dat2$time%%1==0,], aes(fill = group), alpha = 0.3) +
      geom_text(data = dat2[dat2$time%%1==0,], stat = "stratum", aes(label = after_stat(stratum))) +
      theme_bw()
    

    enter image description here