Hello everyone. This is my first post here - I am trying my best to give you everything needed to be able to help me, hopefully I do not forget anything.
I am trying to build a loop that creates stacked bar plots with ggplot2, but I fail because the loop variables do not get recognised in either the ggplot labels or the 'group_by' command.
First, let me share a part of my data frame:
# Reproducible excerpt of the survey data, as produced by dput(): a tibble
# (class "tbl_df") with 15 rows. `Weight` is a numeric column; every other
# column is a factor that additionally carries a `label` attribute (the
# question text) and a `labels` attribute (named value labels).
# The integer vectors inside each factor are positions into `levels`, not the
# level values themselves.
Germany <-
structure(
list(
Weight = structure(
c(
0.9254366,
0.9673721,
1.1321498,
2.7208848,
0.7328256,
0.9142997,
1.53218,
0.9577866,
0.2420226,
0.7830253,
1.1321498,
0.9828443,
0.9770092,
0.7830253,
0.8787283
),
label = "Weight",
format.spss = "F10.7",
display_width = 12L
),
Q6 = structure(
c(10L, 9L, 10L, 6L, 10L, 10L, 10L, 10L, 8L,
7L, 9L, 10L, 7L, 10L, 10L),
levels = c("1", "2", "3", "4",
"5", "6", "7", "8", "9", "10", "99"),
labels = c(
`1 - not at all important` = 1,
`10 - absolutely important` = 10,
`No answer` = 99
),
label = "Q6: Support for Democracy - How important is it for you to live in a country that is governed democratically?",
class = "factor"
),
Q7 = structure(
c(3L, 8L, 10L, 8L, 10L, 4L, 8L, 8L, 5L, 4L,
10L, 10L, 9L, 2L, 9L),
levels = c("1", "2", "3", "4", "5",
"6", "7", "8", "9", "10", "99"),
labels = c(
`1 - not at all democratic` = 1,
`10 - completely democratic` = 10,
`No answer` = 99
),
label = "Q7: Support for Democracy - And how democratically is #COUNTRY_NAME being governed today?",
class = "factor"
),
Q8 = structure(
c(4L, 9L, 9L, 7L, 10L, 4L, 8L, 8L, 9L, 2L,
7L, 9L, 5L, 2L, 8L),
levels = c("1", "2", "3", "4", "5",
"6", "7", "8", "9", "10", "99"),
labels = c(
`1 - not satisfied` = 1,
`10 - very satisfied` = 10,
`No answer` = 99
),
label = "Q8: Support for Democracy - Overall, could you tell me how satisfied you are with the way democracy works in #COUNTRY_NAME?",
class = "factor"
),
Q9 = structure(
c(2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L,
2L, 1L, 2L, 1L, 2L),
levels = c("1", "2", "3", "4", "99"),
labels = c(
`Strongly agree` = 1,
Agree = 2,
Disagree = 3,
`Strongly disagree` = 4,
`No answer` = 99
),
label = "Q9: Support for Democracy - To what extent do you agree or disagree with this statement?",
class = "factor"
),
D2_GENDER_BINARY = structure(
c(2L, 1L, 2L, 2L, 1L, 1L, 2L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L),
levels = c("1", "2"),
labels = c(Male = 1,
Female = 2),
label = "Gender binary",
class = "factor"
),
# Here the factor levels are the age bands themselves (five levels), while the
# `labels` attribute lists six named values including "Younger than 18".
D1a_AGEGROUPS_75 = structure(
c(3L, 1L, 4L, 2L, 5L, 3L, 1L,
5L, 1L, 1L, 4L, 3L, 1L, 1L, 2L),
levels = c("18-29", "30-39",
"40-49", "50-65", "66-75"),
labels = c(
`Younger than 18` = 1,
`18-29` = 2,
`30-39` = 3,
`40-49` = 4,
`50-65` = 5,
`66-75` = 6
),
label = "Age Groups (max age 75)",
class = "factor"
),
Q5_14 = structure(
c(1L, 6L, 2L, 2L, 1L, 2L, 2L, 1L, 4L, 2L,
1L, 10L, 7L, 1L, 5L),
levels = c("1", "2", "3", "4", "5",
"6", "7", "8", "9", "10"),
labels = c(
`1 - not at all democratic` = 1,
`10 - completely democratic` = 10,
`No answer` = 99
),
label = "Q5: Support for Democracy - The country’s security agencies collect data on their citizens’ internet activity : Countries around the world differ in how democratic they are. We sampled the following practices from around the world. How democratic do yo",
class = "factor"
),
# Q14 has 40 levels ("1".."38", "96", "99"), so the code 39L below refers to
# the 39th level, i.e. "96".
Q14 = structure(
c(39L, 9L, 7L, 6L, 8L, 9L, 8L, 11L, 7L, 11L,
39L, 6L, 7L, 7L, 8L),
levels = c(
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"17",
"18",
"19",
"20",
"21",
"22",
"23",
"24",
"25",
"26",
"27",
"28",
"29",
"30",
"31",
"32",
"33",
"34",
"35",
"36",
"37",
"38",
"96",
"99"
),
labels = c(
Reform = 1,
Centre = 2,
EKRE = 3,
SDE = 4,
Isamaa = 5,
`CDU/CSU` = 6,
SPD = 7,
`Bündnis 90/Die Grünen` = 8,
AfD = 9,
FDP = 10,
`Die Linke` = 11,
`United Right (Zjednoczona Prawica)` = 12,
`Civic Coalition (Koalicja Obywatelska)` = 13,
`The Left (Lewica)` = 14,
`Polish Coalition (Koalicja Polska)` = 15,
`Confederation (Konfederacja)` = 16,
`SNS coalition` = 17,
`SPS-JS-KP-ZS` = 18,
SPAS = 19,
VMSZ = 20,
`SPP-DPM` = 21,
PSOE = 22,
PP = 23,
Vox = 24,
Ciudadanos = 25,
`Unidas Podemos` = 26,
ERC = 27,
Socialdemokraterna = 28,
Moderaterna = 29,
Sverigedemokraterna = 30,
Vänsterpartiet = 31,
Centerpartiet = 32,
`Miljöpartiet de Gröna` = 33,
`Servant of the People (Sluha Narodu)` = 34,
`Opposition Platform- for Life` = 35,
Fatherland = 36,
`European Solidarity` = 37,
`Voice (Holos)` = 38,
Other = 96,
`No answer` = 99
),
label = "Q14: Which party did you vote in the #DATE1 parliamentary election? Please click on the answer option that applies to you",
class = "factor"
),
# Q20 has 42 levels ("1".."39", "97", "98", "99"), so the code 40L below
# refers to the 40th level, i.e. "97".
Q20 = structure(
c(10L, 11L, 7L, 6L, 8L, 11L, 8L, 10L, 7L,
9L, 40L, 6L, 7L, 7L, 8L),
levels = c(
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"17",
"18",
"19",
"20",
"21",
"22",
"23",
"24",
"25",
"26",
"27",
"28",
"29",
"30",
"31",
"32",
"33",
"34",
"35",
"36",
"37",
"38",
"39",
"97",
"98",
"99"
),
labels = c(
Reform = 1,
Centre = 2,
EKRE = 3,
SDE = 4,
Isamaa = 5,
`CDU/CSU` = 6,
SPD = 7,
`Bündnis 90/Die Grünen` = 8,
FDP = 9,
`Die Linke` = 10,
AfD = 11,
PiS = 12,
PO = 13,
Polska2050 = 14,
Lewica = 15,
Konfederacja = 16,
SNS = 17,
SPS = 18,
`Dosta je bilo` = 19,
SSP = 20,
Dveri = 21,
`Demokratska Stranka` = 22,
PSOE = 23,
PP = 24,
Vox = 25,
Ciudadanos = 26,
`Unidas Podemos` = 27,
ERC = 28,
Socialdemokraterna = 29,
Moderaterna = 30,
Sverigedemokraterna = 31,
Vänsterpartiet = 32,
Centerpartiet = 33,
`Miljöpartiet de Gröna` = 34,
`Servant of the People (Sluha Narodu)` = 35,
`Opposition Platform- for Life` = 36,
Fatherland = 37,
`European Solidarity` = 38,
`Voice (Holos)` = 39,
Other = 97,
`Would not vote` = 98,
`No answer` = 99
),
label = "Q20: Party Preferences - If there were a parliamentary election in the following days, which party would you vote for?",
class = "factor"
),
Q42 = structure(
c(2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L,
2L, 2L, 1L, 2L, 2L),
levels = c("1", "2"),
labels = c(
`Most people can be trusted` = 1,
`Need to be very careful` = 2
),
label = "Q42: Interest in Politics - Generally speaking, would you say that most people can be trusted or that you need to be very careful in dealing with people?",
class = "factor"
)
),
row.names = c(NA,-15L),
class = c("tbl_df", "tbl", "data.frame")
)
When producing one single stacked bar plot, it works without any issues using the following code:
library(tidyverse)
library(sjlabelled)
library(surveytoolbox) # install with devtools::install_github("martinctc/surveytoolbox")
library(hrbrthemes)
# Helper: break a long label into several lines, using `width` as the target
# line width passed on to str_wrap().
wrap_label <- function(label, width) {
  wrapped <- str_wrap(string = label, width = width)
  wrapped
}
# Weighted count of respondents for every (answer, age group) combination.
# Rows without an answer to Q6 are dropped first.
counts <- Germany %>%
  filter(!is.na(Q6)) %>%
  group_by(Q6, D1a_AGEGROUPS_75) %>%
  # `.groups = "drop"` returns an ungrouped tibble directly and avoids the
  # "summarise() has grouped output by ..." message.
  summarise(weighted_count = sum(Weight), .groups = "drop")

# Sum of all weights: shown as n in the subtitle and used as the denominator
# of each answer's overall share.
total_obs <- sum(Germany$Weight)

# Per answer option:
#   weighted_pct       - share of each age group within the answer (bars
#                        therefore always stack to 100)
#   total_weighted_pct - share of the answer among all respondents
percentages <- counts %>%
  group_by(Q6) %>%
  mutate(
    total_weight = sum(weighted_count),
    weighted_pct = weighted_count / total_weight * 100,
    total_weighted_pct = sum(weighted_count) / total_obs * 100
  ) %>%
  ungroup()

# One row per answer option. Without this, the label above each bar would be
# drawn once per age group, stacking identical text on top of itself.
answer_totals <- distinct(percentages, Q6, total_weighted_pct)

# Stacked bar chart with percentages
ggplot(percentages, aes(x = Q6, y = weighted_pct, fill = D1a_AGEGROUPS_75)) +
  geom_col() +
  # The legend title is set in guides() below, so no `name` is needed here.
  scale_fill_brewer(palette = "Set2") +
  # Percentage of each age group, centred inside its bar segment
  geom_text(
    aes(label = round(weighted_pct), group = D1a_AGEGROUPS_75),
    position = position_stack(vjust = 0.5),
    size = 3,
    color = "white"
  ) +
  # Overall percentage of each answer option, just above the 100% bar top
  geom_text(
    data = answer_totals,
    aes(
      x = Q6,
      y = 100 + 2.5,
      label = paste0(round(total_weighted_pct), "%")
    ),
    inherit.aes = FALSE,
    size = 3
  ) +
  labs(
    # sjlabelled::var_labels() *sets* labels on a data frame; called on a bare
    # string it only hands the string back, so the literal is used directly.
    x = wrap_label("Answer", width = 60),
    y = "Weighted percentage",
    title = str_wrap(get_label(Germany$Q6)),
    subtitle = paste0("Germany (n=", round(total_obs), ")")
  ) +
  theme_minimal(base_size = 11) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  # `width` belongs to str_wrap(), not get_label(); previously it sat inside
  # get_label() and the title was wrapped at str_wrap()'s default width.
  guides(fill = guide_legend(title = str_wrap(
    get_label(Germany$D1a_AGEGROUPS_75),
    width = 65
  )))
When trying to build my loop, I came up with the following code using a nested for-loop. When I finally managed to get the code to run through, the loop variable names in the ggplot labels and the 'group_by' commands were being ignored, and hence the plots did not look like the individually created one at all. I did read a lot about symbols, strings and similar things and tried to adjust my code, but the code fails at the 'group_by' command.
Here is my current loop code. I hope you can help me find out what is wrong with it. Please let me know if I can provide you with any additional information.
# Define variables to loop over
# NOTE(review): of these names only Q6, Q7, Q8, Q9 and Q5_14 are columns of
# the `Germany` excerpt shown above; the others presumably exist in the full
# data set - confirm before running against the excerpt.
variables <- c("Q6", "Q7", "Q8", "Q9", "Q10_1", "Q10_2", "Q10_3",
"Q10_4", "Q5_1", "Q5_2", "Q5_3", "Q5_4",
"Q5_5", "Q5_6", "Q5_7", "Q5_8",
"Q5_9", "Q5_10","Q5_11","Q5_12","Q5_13","Q5_14")
# Define grouping variables
grouping_vars <- c("D1a_AGEGROUPS_75",
"D2_GENDER_BINARY",
"Q14",
"Q20",
"Q42")
# Loop over variables and grouping variables
# NOTE(review): `var` and `group_var` hold column names as character strings.
# dplyr verbs and ggplot2's aes() do not look such strings up as columns, so
# `is.na(var)`, `group_by(var, group_var)`, `group_by(var)` and
# `aes(x = var, fill = group_var, ...)` below operate on the strings
# themselves instead of the columns they name. Writing `.data[[var]]` and
# `.data[[group_var]]` in those places makes them refer to the columns.
for(var in variables){
for(group_var in grouping_vars){
# Compute weighted counts by answer and grouping variable
counts <- Germany %>%
filter(!is.na(var)) %>%
group_by(var, group_var) %>%
summarise(weighted_count = sum(Weight)) %>%
ungroup()
# Compute weighted percentages by answer and grouping variable and
# total number of observations and percentage for each answer option
# (total_obs is the sum of all weights and does not depend on the loop
# variables)
total_obs <- sum(Germany$Weight)
percentages <- counts %>%
group_by(var) %>%
mutate(total_weight = sum(weighted_count),
weighted_pct = weighted_count / total_weight * 100,
total_weighted_pct = sum(weighted_count)/total_obs*100) %>%
ungroup()
# The plot itself
p <- ggplot(percentages, aes(x = var, y = weighted_pct, fill = group_var)) +
geom_col() +
# NOTE(review): the legend title given here is replaced by the title set in
# guides() at the end of the plot.
scale_fill_brewer(name = var_labels(group_var), palette = "Set2") +
# Add text labels with percentage for each group on top of bars
geom_text(aes(label = round(weighted_pct), group = group_var),
position = position_stack(vjust = 0.5),
size=3, color="white") +
# Add text label with percentage for each answer option above bars
geom_text(aes(x=var, y=100+2.5,
label=paste0(round(total_weighted_pct),"%")),
size=3) +
# Graphic titles
# (here the columns are extracted with Germany[[var]], which does accept a
# column name given as a string)
labs(x = wrap_label(var_labels(Germany[[var]]), width=60),
y="Weighted percentage",
title=str_wrap(get_label(Germany[[var]])),
subtitle=paste0("Germany (n=", round(total_obs), ")")) +
theme_minimal(base_size=11) +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
guides(fill=guide_legend(title=str_wrap(get_label(Germany[[group_var]] ), width=65)))
# Save plot as png file with appropriate name based on variable and grouping variable names
# NOTE(review): assumes the Output/Plots/ directory already exists - confirm.
ggsave(filename=paste0("Output/Plots/",
var, "_", group_var, ".png"),
plot=p,
dpi=300,
height=4,
width=7,
units='in')
}
}
The issue is that your loop counters `var` and `group_var` are character strings. Simply using these character strings in `dplyr` verbs or in `ggplot2` as if they were unquoted column names will not work. Instead, you have to tell `dplyr` and/or `ggplot2` that these character strings are names of columns in your dataset, which you can achieve by using the `.data` pronoun, i.e. use e.g. `.data[[var]]` instead of just `var`:
library(dplyr)
library(ggplot2)
library(stringr)
library(sjlabelled)
# Answer variables (x axis) and grouping variables (fill) to combine.
variables <- c(
  "Q6", "Q5_3"
)
grouping_vars <- c(
  "D1a_AGEGROUPS_75"
)

# Sum of all weights: shown as n in the subtitle and used as the denominator
# of each answer's overall share. It does not depend on the loop variables,
# so it is computed once.
total_obs <- sum(Germany$Weight)

for (var in variables) {
  for (group_var in grouping_vars) {
    # `.data[[name]]` fails on an unknown column, which would stop the whole
    # loop (e.g. "Q5_3" is not part of the 15-row sample posted in the
    # question). Skip such combinations with a warning instead.
    missing_cols <- setdiff(c(var, group_var), names(Germany))
    if (length(missing_cols) > 0) {
      warning(
        "Skipping ", var, " x ", group_var, ": column(s) not found: ",
        paste(missing_cols, collapse = ", "),
        call. = FALSE
      )
      next
    }

    # Weighted count per (answer, group). `var`/`group_var` are strings, so
    # they are looked up as columns through the `.data` pronoun.
    counts <- Germany %>%
      filter(!is.na(.data[[var]])) %>%
      group_by(.data[[var]], .data[[group_var]]) %>%
      # `.groups = "drop"` returns an ungrouped tibble and avoids the
      # "summarise() has grouped output by ..." message.
      summarise(weighted_count = sum(Weight), .groups = "drop")

    # Per answer option:
    #   weighted_pct       - share of each group within the answer
    #   total_weighted_pct - share of the answer among all respondents
    percentages <- counts %>%
      group_by(.data[[var]]) %>%
      mutate(
        total_weight = sum(weighted_count),
        weighted_pct = weighted_count / total_weight * 100,
        total_weighted_pct = sum(weighted_count) / total_obs * 100
      ) %>%
      ungroup()

    # One row per answer option, so the label above each bar is drawn once
    # rather than once per group (identical text overplotted on itself).
    answer_totals <- percentages[!duplicated(percentages[[var]]), ]

    p <- ggplot(
      percentages,
      aes(x = .data[[var]], y = weighted_pct, fill = .data[[group_var]])
    ) +
      geom_col() +
      # get_label() reads the variable label; var_labels() is the setter and
      # would hand back the column itself. `def.value` falls back to the
      # column name when a variable carries no label.
      scale_fill_brewer(
        name = str_wrap(
          get_label(Germany[[group_var]], def.value = group_var),
          width = 65
        ),
        palette = "Set2"
      ) +
      # Percentage of each group, centred inside its bar segment
      geom_text(
        aes(label = round(weighted_pct), group = .data[[group_var]]),
        position = position_stack(vjust = 0.5),
        size = 3, color = "white"
      ) +
      # Overall percentage of each answer option, just above the bar top
      geom_text(
        data = answer_totals,
        aes(
          x = .data[[var]],
          y = 100 + 2.5,
          label = paste0(round(total_weighted_pct), "%")
        ),
        inherit.aes = FALSE,
        size = 3
      ) +
      labs(
        x = "Answer",
        y = "Weighted percentage",
        title = str_wrap(get_label(Germany[[var]], def.value = var)),
        subtitle = paste0("Germany (n=", round(total_obs), ")")
      ) +
      theme_minimal(base_size = 11) +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))

    # Plots created inside a for loop are not auto-printed.
    print(p)
  }
}