rggplot2sankey-diagram

Add percentages of patients in the second node of patients coming from first node in Sankey diagram


My question is based on this post How to colour skankey node with all colours of first node that it is related to in ggplot?. You might find a reproducable code in this initial post.

And the question is how to add percentages of patients in the second node of patients coming from first node?

I need something like that enter image description here

I know how to calculate the percentages for each node

geom_sankey_label(aes(
    label = after_stat(paste0(node, "\nn = ", freq," (",
                              scales::percent(ave(freq, x, FUN = \(x) x / sum(x)),accuracy = 0.1),")"
    ))),
    size = 3.5, color = 1, fill = "white" )

But how to split it further to show the percentage coming from first node?

Data from initial post:

df2 = structure(list(x = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), levels = c("Method_Group", 
"Topic"), class = "factor"), node = c("BRUV + Both", "Behavioural Ecology", 
"BRUV + Both", "Conservation Methods", "BRUV + Both", "Other Drivers", 
"Animal Borne + No Receiver", "Behavioural Ecology", "Controlled + Receiver", 
"Behavioural Ecology", "Controlled + Receiver", "Reproductive Ecology", 
"Controlled + Receiver", "Other Drivers", "Controlled + Receiver", 
"Behavioural Ecology", "Controlled + Receiver", "Methodological", 
"Animal Borne + No Receiver", "Behavioural Ecology", "Animal Borne + No Receiver", 
"Methodological", "Stationary + No Receiver", "Reproductive Ecology", 
"Stationary + No Receiver", "Landuse Management", "Stationary + No Receiver", 
"Other Drivers", "Animal Borne + No Receiver", "Behavioural Ecology", 
"Animal Borne + No Receiver", "Methodological", "Animal Borne + No Receiver", 
"Reproductive Ecology", "Stationary + Receiver", "Behavioural Ecology", 
"Stationary + Receiver", "Fisheries Managemenet", "Stationary + Receiver", 
"Behavioural Ecology", "Stationary + Receiver", "Methodological", 
"Stationary + Receiver", "Fisheries Managemenet", "BRUV + Both", 
"Behavioural Ecology", "BRUV + Both", "Methodological", "BRUV + Both", 
"Conservation Methods"), next_x = structure(c(2L, NA, 2L, NA, 
2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 
2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 
2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA), levels = c("Method_Group", 
"Topic"), class = "factor"), next_node = c("Behavioural Ecology", 
NA, "Conservation Methods", NA, "Other Drivers", NA, "Behavioural Ecology", 
NA, "Behavioural Ecology", NA, "Reproductive Ecology", NA, "Other Drivers", 
NA, "Behavioural Ecology", NA, "Methodological", NA, "Behavioural Ecology", 
NA, "Methodological", NA, "Reproductive Ecology", NA, "Landuse Management", 
NA, "Other Drivers", NA, "Behavioural Ecology", NA, "Methodological", 
NA, "Reproductive Ecology", NA, "Behavioural Ecology", NA, "Fisheries Managemenet", 
NA, "Behavioural Ecology", NA, "Methodological", NA, "Fisheries Managemenet", 
NA, "Behavioural Ecology", NA, "Methodological", NA, "Conservation Methods", 
NA)), row.names = c(NA, -50L), class = c("tbl_df", "tbl", "data.frame"
))

Code from initial post:

library(ggsankey)
library(dplyr)
library(ggplot2)

width <- .4
p <- ggplot(df2, aes(x = x, next_x = next_x, node = node, next_node = next_node, fill = factor(node), label = node)) +
  geom_sankey(flow.alpha = 1, node.color = "black", show.legend = FALSE, width = width, linewidth = 2) +
  theme_void() +
  theme(
    plot.margin = unit(rep(5.5, 4), "pt")
  ) +
  scale_fill_viridis_d()

# Get the data from the flows layer
dat <- layer_data(last_plot(), 1) |>
  filter(x ==  2 - width / 2) |>
  distinct(fill, flow_end_ymax, .keep_all = TRUE)

# Get the data from the boxes layer
dat1 <- layer_data(last_plot(), 2) |>
  filter(xmin == 2 - width / 2)
p +
  geom_rect(data = dat, aes(
    xmin = x, xmax = x + width,
    ymin = flow_end_ymin, ymax = flow_end_ymax,
    fill = label
  ), inherit.aes = FALSE) +
  # Draw the outline
  geom_rect(data = dat1, aes(
    xmin = xmin, xmax = xmax,
    ymin = ymin, ymax = ymax
  ), inherit.aes = FALSE, fill = NA, color = "black", linewidth = 2) +
  geom_sankey_label(size = 5, color = "black", fill = "white") +
  guides(fill = "none")

Solution

  • Here is one option which extends the approach from the referenced post to add labels for the proportions and where I shifted the labels for the final nodes slightly to the right:

    library(ggsankey)
    library(dplyr, warn = FALSE)
    library(ggplot2)
    
    dat2 <- dat |>
      # Filter for flows to "Behavioural Ecology"
      semi_join(
        dat1 |> filter(grepl("^Beh", node)),
        by = join_by(
          flow_end_ymin >= ymin,
          flow_end_ymax <= ymax
        )
      ) |>
      # Add the proportions
      left_join(
        df2 |>
          count(node, next_node) |>
          mutate(prop = n / sum(n), .by = next_node) |>
          filter(grepl("^Beh", next_node)),
        by = c(label = "node")
      )
    
    p +
      geom_rect(data = dat, aes(
        xmin = x, xmax = x + width,
        ymin = flow_end_ymin, ymax = flow_end_ymax,
        fill = label
      ), inherit.aes = FALSE) +
      geom_rect(data = dat1, aes(
        xmin = xmin, xmax = xmax,
        ymin = ymin, ymax = ymax
      ), inherit.aes = FALSE, fill = NA, color = "black", linewidth = 2) +
      geom_text(
        data = dat2,
        aes(
          x = x + width / 2,
          y = .5 * (flow_end_ymin + flow_end_ymax),
          label = scales::percent(prop, accuracy = 1),
        ),
        color = "white",
        inherit.aes = FALSE
      ) +
      geom_sankey_label(
        aes(
          x = stage(x, after_stat = x + width / 2 * ifelse(x == min(x), 0, .5)),
          hjust = after_stat(ifelse(x == min(x), .5, 0))
        ),
        size = 5, color = "black", fill = "white"
      ) +
      guides(fill = "none")
    

    enter image description here