I am trying to make either a multi node Sankey or Alluvial plot whichever is more appropriate.
The output would be similar to this which is from the ggalluvial
packgage vignette here gg alluvial vignette
The difference would be that my time_period
would be on the x axis and my source
column would replace the survey responses. I also tried the Sankey plot from the networkD3
package for a single time period to get results similar to below from the vignette here Sankey Plot
Which would be a good compromise if I cant visualize all the time periods but it did not work either. My sample data and code is below. Thanks
Data
dat = structure(list(time_period = c("1 -> 2", "1 -> 2", "1 -> 2",
"1 -> 2", "1 -> 2", "1 -> 2", "1 -> 2", "1 -> 2", "1 -> 2", "1 -> 2",
"1 -> 2", "1 -> 2", "1 -> 2", "1 -> 2", "1 -> 2", "1 -> 2", "2 -> 3",
"2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3",
"2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3", "2 -> 3",
"2 -> 3", "3 -> 4", "3 -> 4", "3 -> 4", "3 -> 4", "3 -> 4", "3 -> 4",
"3 -> 4", "3 -> 4", "3 -> 4", "3 -> 4", "3 -> 4", "3 -> 4", "3 -> 4",
"3 -> 4", "3 -> 4", "3 -> 4", "4 -> 5", "4 -> 5", "4 -> 5", "4 -> 5",
"4 -> 5", "4 -> 5", "4 -> 5", "4 -> 5", "4 -> 5", "4 -> 5", "4 -> 5",
"4 -> 5", "4 -> 5", "4 -> 5", "4 -> 5", "4 -> 5"), source = c("A",
"A", "A", "A", "B", "B", "B", "B", "C", "C", "C", "C", "D", "D",
"D", "D", "A", "A", "A", "A", "B", "B", "B", "B", "C", "C", "C",
"C", "D", "D", "D", "D", "A", "A", "A", "A", "B", "B", "B", "B",
"C", "C", "C", "C", "D", "D", "D", "D", "A", "A", "A", "A", "B",
"B", "B", "B", "C", "C", "C", "C", "D", "D", "D", "D"), target = c("A",
"B", "C", "D", "A", "B", "C", "D", "A", "B", "C", "D", "A", "B",
"C", "D", "A", "B", "C", "D", "A", "B", "C", "D", "A", "B", "C",
"D", "A", "B", "C", "D", "A", "B", "C", "D", "A", "B", "C", "D",
"A", "B", "C", "D", "A", "B", "C", "D", "A", "B", "C", "D", "A",
"B", "C", "D", "A", "B", "C", "D", "A", "B", "C", "D"), count = c(200573,
27490, 869, 11330, 22136, 208721, 243, 921, 1552, 266, 97647,
489, 9644, 743, 491, 62900, 179754, 23188, 1111, 9760, 27824,
193337, 228, 769, 858, 159, 83213, 330, 10410, 869, 474, 54946,
188765, 30850, 973, 9485, 22181, 196101, 218, 1012, 1482, 292,
91553, 392, 9989, 724, 431, 50766, 201313, 25308, 1095, 10801,
25842, 206138, 246, 836, 1199, 210, 94152, 362, 8414, 624, 457,
55365)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-64L))
Code:
library(networkD3)
library(tidyverse)
dat = dat %>% filter(time_period == '4 -> 5')
nodes <- data.frame(name=c(as.character(dat$source), as.character(dat$target)) %>% unique())
dat$IDsource=match(dat$source, nodes$name)-1
dat$IDtarget=match(dat$target, nodes$name)-1
# Make the Network
sankeyNetwork(Links = dat, Nodes = nodes,
Source = "IDsource", Target = "IDtarget",
Value = "count", NodeID = "name",
sinksRight=FALSE, nodeWidth=40, fontSize=13, nodePadding=20)
I gave it a go using ggalluvial
. Each section is plotted quite independently. The height difference between borders can be fixed by adding empty rows.
library(ggalluvial)
library(dplyr)
head(dat)
# time_period source target count
# <chr> <chr> <chr> <dbl>
#1 1 -> 2 A A 200573
#2 1 -> 2 A B 27490
#3 1 -> 2 A C 869
#4 1 -> 2 A D 11330
#5 1 -> 2 B A 22136
#6 1 -> 2 B B 208721
# re-format into long data format
dat2 = NULL
for (i in 1:4) {
tem <- dat %>%
filter(time_period == paste0(i," -> ", i+1)) %>%
{data.frame(
alluvium = paste(i, c(seq(nrow(.)),
seq(nrow(.)))), # same alluvium for each pair
time = c(rep(i+ifelse(i>1,0.0001,0), nrow(.)),
rep(i+1, nrow(.))), # +0.0001 to avoid overlapping with previous section
count = c(.$count, .$count),
group = c(.$source, .$target)
)
}
dat2 <- rbind(dat2, tem)
}
# padding missing values to match borders between sections
for (i in 1:4) {
# right side missing
if (i < 4) {
count_this <- dat %>%
filter(time_period == paste0(i," -> ", i+1)) %>%
group_by(target) %>%
summarise(count = sum(count)) %>%
.$count
count_right <- dat %>%
filter(time_period == paste0(i+1," -> ", i+2)) %>%
group_by(source) %>%
summarise(count = sum(count)) %>%
.$count
count_max <- pmax(count_this, count_right)
dat2 = dat2 %>%
rbind(., data.frame(
alluvium = paste(i, "right_missing", seq(4)),
time = c(rep(i+1, 4)),
count = count_max - count_this, # padding if less count than the right side
group = c("A", "B", "C", "D")
)
)
}
# left side missing
if (i > 1) {
count_left <- dat %>%
filter(time_period == paste0(i-1," -> ", i)) %>%
group_by(target) %>%
summarise(count = sum(count)) %>%
.$count
count_this <- dat %>%
filter(time_period == paste0(i," -> ", i+1)) %>%
group_by(source) %>%
summarise(count = sum(count)) %>%
.$count
count_max <- pmax(count_left, count_this)
dat2 = dat2 %>%
rbind(., data.frame(
alluvium = paste(i, "left_missing", seq(4)),
time = c(rep(i+0.0001, 4)),
count = count_max - count_this, # padding if less count than the left side
group = c("A", "B", "C", "D")
)
)
}
}
dat2 %>%
filter(count>0) %>%
ggplot(aes(x = time, y = count, stratum = group, alluvium = alluvium)) +
geom_flow(aes(fill = group)) +
geom_stratum(data = dat2[dat2$time%%1==0,], aes(fill = group), alpha = 0.3) +
geom_text(data = dat2[dat2$time%%1==0,], stat = "stratum", aes(label = after_stat(stratum))) +
theme_bw()