rdataframeggplot2

subset 2 ggplots in R to decrease the y axis


i have a data frame in R with 2 columns .1rst column named var has 40 categories and column val has the likert values. After likert plot and the count bar plot the result is an ugly non understandable plot (attached picture). Is there a way to display the 20 worst categories (based on "Strongly disagree" and "Disagree") ? The issue would be easy if if it were before the plotting.But now i do not know how to handle it . Any help ?

library(tibble)

# Create 40 categories
var_levels <- paste0("Category_", 1:40)

likert_levels <- c(
  "Strongly disagree",
  "Disagree",
  "Neither agree nor disagree",
  "Agree",
  "Strongly agree"
)

set.seed(42)
df <- tibble(
  var = sample(var_levels, 500, replace = TRUE),  # Random values from 40 categories
  val = sample(likert_levels, 500, replace = TRUE)  # Random values from Likert levels
)

df

df2=df%>%
  mutate(across(everything(), as.factor))%>%
  group_by(var)%>%
  mutate(row = row_number()) %>%
  pivot_wider(names_from = var,   # The values in 'var' will become new column names
              values_from = val)%>%
  select(-row)

v1 = ggstats::gglikert(df2)+
  aes(y = reorder(.question,
                  ifelse(
                    .answer %in% c("Strongly disagree", "Disagree"),
                    1, 0),FUN = sum),decreasing=TRUE)


v2 <- df %>%
  mutate(
    var = reorder(var,
                  ifelse(
                    val %in% c("Strongly disagree", "Disagree"),
                    1, 0
                  ),
                  FUN = sum
    )
  ) |>
  count(var, name = "count") %>%
  ggplot(., aes(y = var, x = count)) +
  geom_bar(stat = "identity", fill = "lightgrey")

ggarrange(v1, v2, widths = c(6, 2))

enter image description here


Solution

  • Here is one possible option which simply computes the relative frequencies before plotting for which I use ave. based on this calculation you can then pick the worst 20 categories.

    Note: Besides Strong disagree and disagree I include half of the count for the neutral category.

    library(tibble)
    
    # Create 40 categories
    var_levels <- paste0("Category_", 1:40)
    
    likert_levels <- c(
      "Strongly disagree",
      "Disagree",
      "Neither agree nor disagree",
      "Agree",
      "Strongly agree"
    )
    
    set.seed(42)
    df <- tibble(
      var = sample(var_levels, 500, replace = TRUE), # Random values from 40 categories
      val = sample(likert_levels, 500, replace = TRUE) # Random values from Likert levels
    )
    
    df
    
    library(tidyverse)
    library(ggplot2)
    library(ggpubr)
    
    df <- df %>%
      mutate(
        val = factor(val, likert_levels),
        var = reorder(
          var,
          ave(
            as.numeric(val), var,
            FUN = \(x) {
              sum(x %in% 1:2) / length(x[!is.na(x)])
            }
          )
        )
      )
    
    worst20 <- levels(df$var)[-(1:20)]
    
    df2 <- df %>%
      filter(var %in% worst20) |>
      group_by(var) %>%
      mutate(row = row_number()) %>%
      pivot_wider(
        names_from = var, # The values in 'var' will become new column names
        values_from = val,
        names_vary = "fastest",
      ) %>%
      select(-row)
    
    v1 <- ggstats::gglikert(df2) +
      aes(y = reorder(
        factor(.question, levels = levels(df$var)),
        ave(
          as.numeric(.answer), .question,
          FUN = \(x) {
            sum(x %in% 1:2) / length(x[!is.na(x)])
          }
        )
      ))
    
    v2 <- df %>%
      filter(var %in% worst20) |>
      count(var, name = "count") %>%
      ggplot(., aes(y = var, x = count)) +
      geom_bar(stat = "identity", fill = "lightgrey")
    
    ggarrange(v1, v2, widths = c(6, 2))
    

    enter image description here