rdata-visualizationsankey-diagramnetworkd3

How to prepare input data for a sankey diagrams in R?


I am trying to produce a sankey diagram in R, which is also referred as a river plot. I've seen this question Sankey Diagrams in R? where a broad variaty of packages producing sankey diagrams are listed. Since I have input data and know different tools/packages I can produce such diagram BUT my euqestion is: how can I prepare input data for such?

Let's assume we would like to present how users have migrated between various states over 10 days and have start data set like the one below:

data.frame(userID = 1:100,
                     day1_state = sample(letters[1:8], replace = TRUE, size = 100),
                     day2_state = sample(letters[1:8], replace = TRUE, size = 100),
                     day3_state = sample(letters[1:8], replace = TRUE, size = 100),
                     day4_state = sample(letters[1:8], replace = TRUE, size = 100),
                     day5_state = sample(letters[1:8], replace = TRUE, size = 100),
                     day6_state = sample(letters[1:8], replace = TRUE, size = 100),
                     day7_state = sample(letters[1:8], replace = TRUE, size = 100),
                     day8_state = sample(letters[1:8], replace = TRUE, size = 100),
                     day9_state = sample(letters[1:8], replace = TRUE, size = 100),
                     day10_state = sample(letters[1:8], replace = TRUE, size = 100)
                     ) -> dt

Now if one would like to create a sankey diagram with networkD3 package how should one tranform this dt data.frame into required input

so that we would have input like from this example

library(networkD3)
URL <- paste0(
        "https://cdn.rawgit.com/christophergandrud/networkD3/",
        "master/JSONdata/energy.json")
Energy <- jsonlite::fromJSON(URL)
# Plot
sankeyNetwork(Links = Energy$links, Nodes = Energy$nodes, Source = "source",
             Target = "target", Value = "value", NodeID = "name",
             units = "TWh", fontSize = 12, nodeWidth = 30)

EDIT

I have found such script which prepares data in other situation and reproduced it so I assume it might be closed now:

https://github.com/mi2-warsaw/JakOniGlosowali/blob/master/sankey/sankey.R


Solution

  • I have found such script which prepares data in other situation and reproduced it so I assume it might be closed now:

    https://github.com/mi2-warsaw/JakOniGlosowali/blob/master/sankey/sankey.R

    Then this code generates such sankey diagram for mentioned in question data.frame

    fixtable <- function(...) {
        tab <- table(...)
        if (substr(colnames(tab)[1],1,1) == "_" &
                    substr(rownames(tab)[1],1,1) == "_") {
            tab2 <- tab
            colnames(tab2) <- sapply(strsplit(colnames(tab2), split=" "), `[`, 1)
            rownames(tab2) <- sapply(strsplit(rownames(tab2), split=" "), `[`, 1)
            tab2[1,1] <- 0
            # mandat w klubie
            for (par in names(which(tab2[1,] > 0))) {
                delta = min(tab2[par, 1], tab2[1, par])
                tab2[par, par] = tab2[par, par] + delta
                tab2[1, par] = tab2[1, par] - delta
                tab2[par, 1] = tab2[par, 1] - delta
            }
            # przechodzi przez niezalezy
            for (par in names(which(tab2[1,] > 0))) {
                tab2["niez.", par] = tab2["niez.", par] + tab2[1, par]
                tab2[1, par] = 0
            }
            for (par in names(which(tab2[,1] > 0))) {
                tab2[par, "niez."] = tab2[par, "niez."] + tab2[par, 1]
                tab2[par, 1] = 0
            }
    
            tab[] <- tab2[] 
        }
        tab
    }
    
    
    flow2 <- rbind(
        data.frame(fixtable(z = paste0(dat$day1_state, " day1"), do = paste0(dat$day2_state, " day2"))),
        data.frame(fixtable(z = paste0(dat$day2_state, " day2"), do = paste0(dat$day3_state, " day3"))),
        data.frame(fixtable(z = paste0(dat$day3_state, " day3"), do = paste0(dat$day4_state, " day4"))),
        data.frame(fixtable(z = paste0(dat$day4_state, " day4"), do = paste0(dat$day5_state, " day5"))),
        data.frame(fixtable(z = paste0(dat$day5_state, " day5"), do = paste0(dat$day6_state, " day6"))),
        data.frame(fixtable(z = paste0(dat$day6_state, " day6"), do = paste0(dat$day7_state, " day7"))),
        data.frame(fixtable(z = paste0(dat$day7_state, " day7"), do = paste0(dat$day8_state, " day8"))),
        data.frame(fixtable(z = paste0(dat$day8_state, " day8"), do = paste0(dat$day9_state, " day9"))),
        data.frame(fixtable(z = paste0(dat$day9_state, " day9"), do = paste0(dat$day10_state, " day10"))))
    
    flow2 <- flow2[flow2[,3] > 0,]
    
    nodes2 <- data.frame(name=unique(c(levels(factor(flow2[,1])), levels(factor(flow2[,2])))))
    nam2 <- seq_along(nodes2[,1])-1
    names(nam2) <- nodes2[,1]
    
    links2 <- data.frame(source = nam2[as.character(flow2[,1])],
                                            target = nam2[as.character(flow2[,2])],
                                            value = flow2[,3])
    
    sankeyNetwork(Links = links, Nodes = nodes,
                                Source = "source", Target = "target",
                                Value = "value", NodeID = "name",
                                fontFamily = "Arial", fontSize = 12, nodeWidth = 40,
                                colourScale = "d3.scale.category20()")