i have a data frame in R with 2 columns .1rst column named var has 40 categories and column val has the likert values. After likert plot and the count bar plot the result is an ugly non understandable plot (attached picture). Is there a way to display the 20 worst categories (based on "Strongly disagree" and "Disagree") ? The issue would be easy if if it were before the plotting.But now i do not know how to handle it . Any help ?
library(tibble)
# Create 40 categories
var_levels <- paste0("Category_", 1:40)
likert_levels <- c(
"Strongly disagree",
"Disagree",
"Neither agree nor disagree",
"Agree",
"Strongly agree"
)
set.seed(42)
df <- tibble(
var = sample(var_levels, 500, replace = TRUE), # Random values from 40 categories
val = sample(likert_levels, 500, replace = TRUE) # Random values from Likert levels
)
df
df2=df%>%
mutate(across(everything(), as.factor))%>%
group_by(var)%>%
mutate(row = row_number()) %>%
pivot_wider(names_from = var, # The values in 'var' will become new column names
values_from = val)%>%
select(-row)
v1 = ggstats::gglikert(df2)+
aes(y = reorder(.question,
ifelse(
.answer %in% c("Strongly disagree", "Disagree"),
1, 0),FUN = sum),decreasing=TRUE)
v2 <- df %>%
mutate(
var = reorder(var,
ifelse(
val %in% c("Strongly disagree", "Disagree"),
1, 0
),
FUN = sum
)
) |>
count(var, name = "count") %>%
ggplot(., aes(y = var, x = count)) +
geom_bar(stat = "identity", fill = "lightgrey")
ggarrange(v1, v2, widths = c(6, 2))
Here is one possible option which simply computes the relative frequencies before plotting for which I use ave
. based on this calculation you can then pick the worst 20 categories.
Note: Besides Strong disagree and disagree I include half of the count for the neutral category.
library(tibble)
# Create 40 categories
var_levels <- paste0("Category_", 1:40)
likert_levels <- c(
"Strongly disagree",
"Disagree",
"Neither agree nor disagree",
"Agree",
"Strongly agree"
)
set.seed(42)
df <- tibble(
var = sample(var_levels, 500, replace = TRUE), # Random values from 40 categories
val = sample(likert_levels, 500, replace = TRUE) # Random values from Likert levels
)
df
library(tidyverse)
library(ggplot2)
library(ggpubr)
df <- df %>%
mutate(
val = factor(val, likert_levels),
var = reorder(
var,
ave(
as.numeric(val), var,
FUN = \(x) {
sum(x %in% 1:2) / length(x[!is.na(x)])
}
)
)
)
worst20 <- levels(df$var)[-(1:20)]
df2 <- df %>%
filter(var %in% worst20) |>
group_by(var) %>%
mutate(row = row_number()) %>%
pivot_wider(
names_from = var, # The values in 'var' will become new column names
values_from = val,
names_vary = "fastest",
) %>%
select(-row)
v1 <- ggstats::gglikert(df2) +
aes(y = reorder(
factor(.question, levels = levels(df$var)),
ave(
as.numeric(.answer), .question,
FUN = \(x) {
sum(x %in% 1:2) / length(x[!is.na(x)])
}
)
))
v2 <- df %>%
filter(var %in% worst20) |>
count(var, name = "count") %>%
ggplot(., aes(y = var, x = count)) +
geom_bar(stat = "identity", fill = "lightgrey")
ggarrange(v1, v2, widths = c(6, 2))