Tags: r, ggplot2, dplyr

Sort two plots using ggplot2 based on a combination of level percentages in R


I have a data frame in R with two columns, `country` and `smoke`, both factors.

I want to change the order of the bars in the left plot (see the image) so that the countries are sorted in decreasing order of the combined share of "smoke" and "vaping". Right now the plot is not sorted. For example, with the simulated data in the picture, France must be on top, followed by USA, then UK, with Germany at the bottom.

The same country order must also be applied to the second plot, i.e. it must be France, USA, UK, Germany.

library(dplyr)
library(ggplot2)
library(forcats)
# Simulate 50 respondents; the fixed seed keeps the draws reproducible.
set.seed(123)
levels_country <- c("USA", "UK", "FRANCE", "GERMANY")
levels_smoke <- c("smoke", "not smoke", "vaping")

# Countries are drawn first and smoking status second, so the random stream
# is consumed in a fixed order.
country <- sample(levels_country, size = 50, replace = TRUE)
smoke <- sample(levels_smoke, size = 50, replace = TRUE)

# Store both columns as factors with an explicit level order.
df <- tibble(
  country = factor(country, levels = levels_country),
  smoke = factor(smoke, levels = levels_smoke)
)


# Count respondents per country/smoke pair, then express each count as a share
# of its country's total.
# `.groups = "drop_last"` spells out the grouping that summarise() would
# otherwise choose implicitly (and announce with a message): the result stays
# grouped by country, so sum(n) below is the per-country total.
Grouped <- df %>%
  dplyr::group_by(country, smoke) %>%
  dplyr::summarise(n = dplyr::n(), .groups = "drop_last") %>%
  dplyr::mutate(
    summed = sum(n),
    percentage = n / summed
  )

# Rank countries by their combined share of "smoke" and "vaping", largest
# first. ("not smoke" must be excluded here; ranking on smoke + not smoke
# would order the countries by the complement of the vaping share instead.)
ordered_countries <- Grouped %>%
  dplyr::filter(smoke %in% c("smoke", "vaping")) %>%
  dplyr::group_by(country) %>%
  dplyr::summarise(percentage = sum(percentage)) %>%
  dplyr::arrange(dplyr::desc(percentage)) %>%
  dplyr::select(country)

# Country names in ranked order, as a plain character vector so they can be
# used directly as factor levels.
ranking <- as.character(ordered_countries$country)


# Fill colours, matched in order to the scale limits below:
# smoke -> red1, vaping -> salmon, not smoke -> green3.
smoking_col <- c("red1", "salmon", "green3")

# Horizontal 100% stacked bars: one bar per country, split by smoking status.
g1 <- ggplot(
  Grouped,
  aes(x = country, y = percentage, fill = smoke)
) +
  # geom_col() always uses the identity stat, so no `stat` argument is passed.
  geom_col(position = position_fill(reverse = TRUE)) +
  # The legend is hidden below, so only the limits (which fix the
  # colour-to-level matching) are needed here.
  scale_fill_manual(
    values = smoking_col,
    limits = c("smoke", "vaping", "not smoke")
  ) +
  coord_flip() +
  theme_light() +
  theme(
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_text(size = 13),
    axis.text.y = element_text(size = 13, angle = 0, hjust = 0, vjust = 0)
  ) +
  # `percent` lives in the scales package, which this script never attaches,
  # so it is referenced with an explicit namespace.
  scale_y_continuous(labels = scales::percent)
g1

# Horizontal bar chart of respondents per country, each bar labelled with its
# count.
g2 <- ggplot(df, aes(x = country)) +
  # geom_bar() counts the rows per x value by default.
  geom_bar() +
  # after_stat(count) replaces the deprecated ..count.. notation.
  geom_text(
    aes(y = after_stat(count), label = after_stat(count)),
    stat = "count",
    size = 4.75,
    hjust = -0.15
  ) +
  coord_flip() +
  theme_minimal() +
  theme(
    legend.position = "none",
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15),
    axis.text.x = element_text(size = 13)
  ) +
  # NOTE(review): 1300 is far above the simulated counts (at most 17 per
  # country); presumably sized for the real data -- confirm.
  expand_limits(y = c(0, 1300))



# Place both plots side by side. gridExtra is never attached in this script,
# so grid.arrange() is called with an explicit namespace.
gridExtra::grid.arrange(g1, g2, ncol = 2, widths = c(3, 1.2))

This results in:

enter image description here

Grouped
# A tibble: 12 × 5
# Groups:   country [4]
   country smoke         n summed percentage
   <fct>   <fct>     <int>  <int>      <dbl>
 1 USA     smoke         4     13      0.308
 2 USA     not smoke     3     13      0.231
 3 USA     vaping        6     13      0.462
 4 UK      smoke         4     13      0.308
 5 UK      not smoke     6     13      0.462
 6 UK      vaping        3     13      0.231
 7 FRANCE  smoke         9     17      0.529
 8 FRANCE  not smoke     3     17      0.176
 9 FRANCE  vaping        5     17      0.294
10 GERMANY smoke         2      7      0.286
11 GERMANY not smoke     4      7      0.571
12 GERMANY vaping        1      7      0.143

Solution

  • Here is an approach which simplifies your code a bit and uses reorder to order country by (the sum of) the proportions of smoking and vaping:

    library(dplyr, warn = FALSE)
    library(ggplot2)
    
    # Count each country/smoke pair, turn the counts into within-country
    # shares, then re-level `country` by its combined smoke + vaping share
    # ("not smoke" rows contribute zero to the score).
    Grouped <- df |>
      mutate(
        smoke = factor(smoke, levels = c("smoke", "vaping", "not smoke"))
      ) |>
      count(country, smoke) |>
      mutate(percentage = n / sum(n), .by = country) |>
      mutate(
        country = reorder(
          country,
          ifelse(smoke %in% c("smoke", "vaping"), percentage, 0),
          FUN = sum
        )
      )
    
    # Fill colours, listed in the order smoke, vaping, not smoke.
    smoking_col <- c("red1", "salmon", "green3")

    # Horizontal stacked bars of the within-country shares, one bar per country.
    g1 <- ggplot(Grouped, aes(country, percentage, fill = smoke)) +
      geom_col(position = position_stack(reverse = TRUE)) +
      # Colours are named by level, so the mapping does not depend on the
      # order of the factor levels.
      scale_fill_manual(
        values = setNames(smoking_col, c("smoke", "vaping", "not smoke"))
      ) +
      scale_y_continuous(labels = scales::percent) +
      coord_flip() +
      theme_light() +
      theme(
        legend.position = "none",
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.x = element_text(size = 13),
        axis.text.y = element_text(size = 13, angle = 0, hjust = 0, vjust = 0)
      )
    
    # Total respondents per country (the per-status counts summed), drawn as
    # horizontal bars with the total printed at the end of each bar. `country`
    # keeps the level order set in `Grouped`, so both plots share one ordering.
    g2 <- Grouped |>
      group_by(country) |>
      summarise(n = sum(n)) |>
      ggplot(aes(country, n)) +
      geom_col() +
      geom_label(
        aes(label = n),
        hjust = 0,
        size = 4.75,
        fill = NA,
        label.size = 0
      ) +
      # Extra room past the longest bar so its label is not clipped.
      scale_y_continuous(expand = expansion(add = c(0, 5))) +
      coord_flip() +
      theme_minimal() +
      theme(
        axis.text.x = element_text(size = 13),
        legend.position = "none",
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 15)
      )
    
    # Show the share plot and the count plot side by side (3:2 width ratio).
    gridExtra::grid.arrange(grobs = list(g1, g2), ncol = 2, widths = c(3, 2))