Tags: r, ggplot2, levels, drop, stacked-bar-chart

Keeping unused levels in ggplot2 bar plot does not work properly


Environment: R version 4.3.1, ggplot2 version 3.4.2 (loaded with library(ggplot2)).

I want to use ggplot2 bar charts to display the percentage frequency of events within each concentration class. I have five concentration classes (column c1, with the values "1_", "2_", "3_", "4_", "5_"). Within each class an event can occur "Early" (column c2, dichotomous: "yes" or "no") or "Late" (column c3, dichotomous: "yes" or "no").

To prevent concentration classes without events from being dropped from the x-axis, I have added "scale_x_discrete(drop = FALSE)" to the ggplot call.

If an event occurs in one, three, four or all five concentration classes, bar charts are drawn as I imagined them: no bar is drawn for the concentrations for which no event occurred, but all five concentration classes are displayed on the x-axis.

If an event occurs in only two concentration classes, then instead of two thin bars at the respective concentrations, two wide (thick), adjacent bars are drawn.

What is the reason for this problem and how can I solve it? Thank you very much in advance for any advice.

This is the code I used:

# build data.frame
# concentration class of each of the ten observations
c1 <- rep(c("1_", "2_", "3_", "4_", "5_"), times = c(2, 2, 1, 3, 2))
# early event: "yes" for the first two observations only
c2 <- rep(c("yes", "no"), times = c(2, 8))
# late event: start from all "no" and flag selected observations as "yes".
# The assignments below overwrite each other, so only the last one is in
# effect; comment out the later lines to select another scenario.
c3 <- replace(rep("no", 10), c(3, 5, 6, 10), "yes") # for five bars
c3 <- replace(rep("no", 10), c(3, 5, 10), "yes") # for four bars
c3 <- replace(rep("no", 10), c(3, 10), "yes") # for three bars
c3 <- replace(rep("no", 10), 10, "yes") # for two bars --> THIS is where the error occurs
c3 <- rep("no", 10) # for one bar

dataf <- data.frame(c1 = c1, c2 = c2, c3 = c3)
dataf

# compilation of data + addition of column with event status + reduction of table to relevant columns
library(dplyr) # provides %>%, mutate(), case_when(), select(), group_by(), count()

# all concentration classes, including those that may have no observations
conc_levels <- c("1_", "2_", "3_", "4_", "5_")

# case_when() has no fallback branch here, so observations with neither an
# early nor a late event get c.Compilation = NA
bp11 <- dataf %>%
  mutate(c.Compilation = case_when(
    c2 == "yes" ~ "EarlyEvent",
    c3 == "yes" ~ "LateEvent"
  )) %>%
  select(c1, c.Compilation)
bp11

# insert number (n) and percentage share (prop) within each concentration
# class; ungroup() afterwards so the grouping does not leak into later steps
bp22 <- bp11 %>%
  group_by(c1) %>%
  count(c.Compilation) %>%
  mutate(prop = n / sum(n) * 100) %>%
  ungroup()

# removal of NA (rows without any event); this happens only after prop has
# been computed, so prop is a share of all observations in the class
bp33 <- bp22[!is.na(bp22$c.Compilation), ]

# specify factor level order "Groups"
bp33$c.Compilation <- factor(bp33$c.Compilation, levels = c("LateEvent", "EarlyEvent"))

# specify factor level order "Concentration"
bp33$c1 <- factor(bp33$c1, levels = conc_levels)

# calculate n of each concentration group; counting a factor that carries all
# levels yields 0 (instead of a missing entry) for a class without observations
calcN <- table(factor(bp11$c1, levels = conc_levels))
calcN

# draw stacked barplot
library(ggplot2)

# x-axis categories and their numeric positions on the discrete axis; the
# positions are used to place the "n = ..." labels below the bars
conc_breaks <- c("1_", "2_", "3_", "4_", "5_")
conc_pos <- seq_along(conc_breaks)

ggplot(bp33, aes(x = c1, y = prop, fill = c.Compilation)) +
  geom_bar(stat = "identity", position = "stack") +
  # "n = " is drawn slightly left of each class position and the count
  # slightly right of it, both at y = -1 (just below the bars)
  annotate(
    "text",
    x = c(rbind(conc_pos - 0.1, conc_pos + 0.15)),
    y = -1,
    label = c(rbind("n = ", as.vector(calcN[conc_breaks]))),
    size = 5
  ) +
  # drop = FALSE keeps concentration classes without any bar on the axis
  scale_x_discrete(breaks = conc_breaks, drop = FALSE) +
  # the lower limit of -1 leaves room for the labels placed at y = -1
  scale_y_continuous(limits = c(-1, 101), breaks = seq(5, 100, by = 5)) +
  scale_fill_grey(start = 0.6, end = 0) +
  labs(
    x = "Concentration (mg/L)",
    y = "Event (%)",
    fill = "Groups",
    title = "Css"
  ) +
  theme(
    axis.text = element_text(size = 15, face = "bold"),
    axis.title = element_text(size = 15, face = "bold"),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15),
    plot.title = element_text(size = 20, hjust = 0.5),
    legend.position = c(.18, .84)
  )

This is the result I get when I run the code:

Five bars in the bar chart

Four bars in the bar chart

Three bars in the bar chart

Two bars in the bar chart, this is where the error occurs

One bar in the bar chart


Solution

  • This looks like a bug to me (which after some testing occurs only with stat="identity" and when there are no adjacent categories).

    NOTE: The issue was already reported and is fixed in the development version of ggplot2. (see below).

    A workaround would be to manually complete your dataset using e.g. tidyr::complete:

    library(ggplot2)
    library(dplyr, warn.conflicts = FALSE)
    library(tidyr)

    # Workaround: add the missing c1 x c.Compilation combinations with
    # prop = 0, so that every concentration class is present in the plot data.
    # ungroup() first in case bp33 still carries a grouping by c1; complete()
    # should expand over the whole table, not within groups.
    bp33_complete <- bp33 |>
      ungroup() |>
      complete(c1, c.Compilation, fill = list(prop = 0))

    ggplot(
      bp33_complete,
      aes(x = c1, y = prop, fill = c.Compilation)
    ) +
      geom_col() +
      # per-class sample size: rows of the unaggregated data (bp11) are counted
      # per c1 by stat = "count" and the label is placed at y = -1; fill is
      # unset for this layer
      geom_text(
        aes(
          label = after_stat(paste0("n = ", count)),
          y = after_stat(-1),
          fill = NULL
        ),
        data = bp11,
        stat = "count",
        size = 5
      ) +
      scale_x_discrete(
        breaks = c("1_", "2_", "3_", "4_", "5_"),
        drop = FALSE
      ) +
      scale_y_continuous(
        breaks = seq(5, 100, 5),
        limits = c(-1, 101)
      ) +
      scale_fill_grey(start = 0.6, end = 0) +
      labs(
        x = "Concentration (mg/L)",
        y = "Event (%)",
        fill = "Groups",
        title = "Css"
      ) +
      theme(
        axis.text = element_text(size = 15, face = "bold"),
        axis.title = element_text(size = 15, face = "bold"),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 15),
        plot.title = element_text(size = 20, hjust = 0.5),
        legend.position = c(.18, .84)
      )
    

    enter image description here

    DATA

    # ten observations: concentration class, early-event flag, late-event flag
    c1 <- rep(c("1_", "2_", "3_", "4_", "5_"), times = c(2, 2, 1, 3, 2))
    c2 <- rep(c("yes", "no"), times = c(2, 8))
    # for two bars --> THIS is where the error occurs
    c3 <- replace(rep("no", 10), 10, "yes")

    dataf <- data.frame(c1 = c1, c2 = c2, c3 = c3)
    

    EDIT: Here is a refactored version which uses only one dataset. It keeps the NAs in the data (but drops them for the geom_col) and uses geom_text with stat="summary" to compute the "n = " labels, which should also work for classes with n = 0.

    library(ggplot2)
    library(dplyr, warn.conflicts = FALSE)
    library(tidyr)

    # Drop one c1 category
    dataf <- filter(dataf, c1 != "4_")

    # One dataset for bars and labels: count per c1 x event status, compute the
    # share within each c1, then add all missing combinations with n = 0 and
    # prop = 0. Rows with c.Compilation = NA (no event) are kept here.
    bp33 <- dataf |>
      mutate(c.Compilation = case_when(
        c2 == "yes" ~ "EarlyEvent",
        c3 == "yes" ~ "LateEvent"
      )) |>
      count(c1, c.Compilation) |>
      mutate(prop = n / sum(n) * 100, .by = c1) |>
      mutate(
        c.Compilation = factor(c.Compilation,
          levels = c("LateEvent", "EarlyEvent")
        ),
        c1 = factor(c1,
          levels = c("1_", "2_", "3_", "4_", "5_")
        )
      ) |>
      complete(c1, c.Compilation, fill = list(prop = 0, n = 0))

    ggplot(
      bp33,
      aes(x = c1, y = prop)
    ) +
      # bars are drawn only for rows with an event status (NA rows removed)
      geom_col(
        data = \(d) filter(d, !is.na(c.Compilation)),
        aes(fill = c.Compilation)
      ) +
      # labels: stat = "summary" with fun = sum adds up n per c1 (NA rows
      # included); stage() feeds n to the stat as y and then moves the label to
      # y = -1, while the label text uses the summed y
      geom_text(
        aes(
          label = after_stat(paste0("n = ", y)),
          y = stage(n, after_stat = -1),
          fill = NULL,
          group = c1
        ),
        stat = "summary",
        fun = sum,
        size = 5
      ) +
      scale_x_discrete(
        breaks = c("1_", "2_", "3_", "4_", "5_"),
        drop = FALSE
      ) +
      scale_y_continuous(
        breaks = seq(5, 100, 5),
        limits = c(-1, 101)
      ) +
      scale_fill_grey(
        start = 0.6, end = 0
      ) +
      labs(
        x = "Concentration (mg/L)",
        y = "Event (%)",
        fill = "Groups",
        title = "Css"
      ) +
      theme(
        axis.text = element_text(size = 15, face = "bold"),
        axis.title = element_text(size = 15, face = "bold"),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 15),
        plot.title = element_text(size = 20, hjust = 0.5),
        legend.position = c(.18, .84)
      )
    

    Update: The issue was already reported here and here, and it seems to be fixed in the development version of ggplot2:

    Using ggplot2 3.4.4:

    library(ggplot2)

    packageVersion("ggplot2")
    #> [1] '3.4.4'

    # Minimal example: only the non-adjacent levels "A" and "E" are observed;
    # the levels "B" to "D" in between are unused.
    observed <- factor(c("A", "E"), levels = LETTERS[1:5])
    dat <- data.frame(x = observed)

    ggplot(data = dat, mapping = aes(x = x, y = 1)) +
      scale_x_discrete(drop = FALSE) + # keep the unused levels on the axis
      geom_bar(stat = "identity") # or geom_col
    

    Created on 2023-12-30 with reprex v2.0.2

    Using the development version:

    library(ggplot2)

    packageVersion("ggplot2")
    #> [1] '3.4.4.9000'

    # Minimal example: only the non-adjacent levels "A" and "E" are observed;
    # the levels "B" to "D" in between are unused.
    observed <- factor(c("A", "E"), levels = LETTERS[1:5])
    dat <- data.frame(x = observed)

    ggplot(data = dat, mapping = aes(x = x, y = 1)) +
      scale_x_discrete(drop = FALSE) + # keep the unused levels on the axis
      geom_bar(stat = "identity") # or geom_col