rggplot2sankey-diagram

Replicate Sankey diagram in ggplot2


I am working with package sfo and prepare Sankey diagram. Below you can see my code and my data.

library(sfo)

sfo_passengers %>%
  filter(activity_period == max(activity_period)) %>%
  group_by(activity_type_code, geo_region) %>%
  summarise(total = sum(passenger_count), .groups = "drop")

sfo_passengers %>% 
  filter(operating_airline == "United Airlines",
         activity_period >= 201901 & activity_period < 202001) %>%
  mutate(terminal = ifelse(terminal == "International", "international", terminal)) %>%
  group_by(operating_airline,activity_type_code, geo_summary, geo_region,  terminal) %>%
  summarise(total = sum(passenger_count), .groups = "drop") %>%
  sankey_ly(cat_cols = c("operating_airline", "terminal","geo_summary", "geo_region", "activity_type_code"), 
            num_col = "total",
            title = "Dist. of United Airlines Passengers at SFO During 2019")

p <- plotly::last_plot()

plotly::add_annotations(p, c("Terminals", "Domestic/Int"), x = c(0.2, 0.5), y = c(1, 1), showarrow = FALSE)

enter image description here

Now I want to replicate this in ggplot2. I tryed with code below but is not working.

# Load required libraries
library(ggplot2)
library(dplyr)

sfo_passengers_filtered <- sfo_passengers %>% 
  filter(activity_period == max(activity_period)) %>%
  group_by(activity_type_code, geo_region) %>%
  summarise(total = sum(passenger_count), .groups = "drop")

sfo_passengers_united <- sfo_passengers %>% 
  filter(operating_airline == "United Airlines",
         activity_period >= 201901 & activity_period < 202001) %>%
  mutate(terminal = ifelse(terminal == "International", "international", terminal)) %>%
  group_by(operating_airline,activity_type_code, geo_summary, geo_region,  terminal) %>%
  summarise(total = sum(passenger_count), .groups = "drop")

# Create the plot
ggplot(data = sfo_passengers_united, aes(x = terminal, y = geo_region, 
                                          fill = activity_type_code, text = paste("Passengers: ", total))) +
  geom_rect(aes(x = terminal, xend = after_scale(terminal), y = geo_region, yend = after_scale(geo_region), 
                fill = activity_type_code, alpha = total), color = "black") +
  geom_text(aes(label = total), size = 3) +
  scale_fill_brewer(palette = "Set1", name = "Activity Type") +
  scale_alpha_continuous(range = c(0.1, 1), guide = FALSE) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Distribution of United Airlines Passengers at SFO During 2019",
       x = "Terminals",
       y = "Geo Region")

Can anybody help me how to replicate this diagram in ggplot2 ?


Solution

  • One possible option to recreate your plotly sankey chart using ggplot2 would be to use ggsankey:

    library(dplyr, warn = FALSE)
    library(sfo)
    library(ggsankey)
    library(ggplot2)
    
    dat <- sfo_passengers %>%
      filter(
        operating_airline == "United Airlines",
        activity_period >= 201901 & activity_period < 202001
      ) %>%
      mutate(terminal = ifelse(terminal == "International", "international", terminal)) %>%
      group_by(operating_airline, activity_type_code, geo_summary, geo_region, terminal) %>%
      summarise(total = sum(passenger_count), .groups = "drop")
    
    df <- dat %>%
      make_long(
        operating_airline, terminal,
        geo_summary, geo_region, activity_type_code,
        value = total
      )
    
    ggplot(df, aes(
      x = x,
      next_x = next_x,
      node = node,
      next_node = next_node,
      fill = factor(node, unique(node))
    )) +
      geom_sankey(flow.fill = "grey") +
      geom_sankey_label(
        aes(
          label = node,
          x = stage(
            x,
            after_stat = x + .075 * if_else(
              x == 5, -1, 1
            )
          ),
          hjust = if_else(
            x == "activity_type_code", 1, 0
          )
        ),
        size = 8 / .pt, fill = "white"
      ) +
      ggsci::scale_fill_d3(palette = "category20") +
      scale_x_discrete(
        breaks = c("terminal", "geo_summary"),
        labels = c("Terminals", "Domestic/Int"),
        position = "top"
      ) +
      guides(fill = "none") +
      ggthemes::theme_map(base_size = 11) +
      theme(
        axis.text.x = element_text()
      ) +
      labs(
        title = "Dist. of United Airlines Passengers at SFO During 2019"
      )
    

    UPDATE When you are restricted to CRAN package then one option would be to use ggalluvial but it's not that close to the plot you are trying to re-create:

    library(dplyr, warn = FALSE)
    library(sfo)
    library(ggalluvial)
    #> Loading required package: ggplot2
    library(ggplot2)
    
    dat <- sfo_passengers %>%
      filter(
        operating_airline == "United Airlines",
        activity_period >= 201901 & activity_period < 202001
      ) %>%
      mutate(terminal = ifelse(terminal == "International", "international", terminal)) %>%
      group_by(operating_airline, activity_type_code, geo_summary, geo_region, terminal) %>%
      summarise(total = sum(passenger_count), .groups = "drop")
    
    df <- to_lodes_form(
      dat,
      axes = c(1, 5, 3:4, 2)
    )
    
    ggplot(
      df,
      aes(
        x = x, stratum = stratum, alluvium = alluvium,
        y = total,
        fill = stratum,
        label = stratum
      )
    ) +
      
      geom_stratum(
        alpha = 1,
        width = .15,
        linewidth = .25,
        color = "white"
      ) +
      geom_flow(
        fill = "grey",
        width = .15,
        linewidth = .25,
        color = "white"
      ) +
      geom_label(
        data = ~subset(.x, x != "geo_region"),
        aes(
          x = stage(
            x,
            after_stat = x + .1 * if_else(
              x == 5, -1, 1
            )
          ),
          hjust = if_else(
            x == "activity_type_code", 1, 0
          )
        ),
        size = 8 / .pt, fill = "white",
        stat = "stratum"
      ) +
      ggrepel::geom_label_repel(
        data = ~subset(.x, x == "geo_region"),
        size = 8 / .pt, fill = "white",
        stat = "stratum",
        direction = "y",
        nudge_x = .15,
        hjust = 0
      ) +
      ggsci::scale_fill_d3(palette = "category20") +
      scale_x_discrete(
        breaks = c("terminal", "geo_summary"),
        labels = c("Terminals", "Domestic/Int"),
        position = "top"
      ) +
      guides(fill = "none") +
      ggthemes::theme_map(base_size = 11) +
      theme(
        axis.text.x = element_text()
      ) +
      labs(
        title = "Dist. of United Airlines Passengers at SFO During 2019"
      )