I'm using ggplot2 to create a lollipop graph comparing US university tuitions and median household income (for all races and also for Black households specifically). To make the graph easier to read, I wanted to set the line width and point size of the two household-income bars to 1.3 and 5, while setting the line width and point size of the other bars (tuition and cost) to 0.7 and 2. However, for some reason, R applies my parameters to the bar for Black households and the bar for the University of Florida (see image) instead of applying them to both income bars, and I have no idea how to fix it.
Also, the x and y labels that I set are not applied to the graph. Instead, a small `namerank` label sits in the top-left corner and a `y` label sits in the bottom-right corner, and I don't know how to make them go away either.
Here is my code:
# Lollipop chart built from clg_fee: for every namerank it draws a segment from
# 0 plus an end point for costatt, out_state and in_state, coloured by series.
clg_fee|>
# Sort rows by costatt and freeze that order as the factor levels of namerank,
# so the axis follows the sorted order.
arrange(costatt)|>
mutate(namerank = factor(namerank, namerank))|>
ggplot() +
geom_segment(
aes(x=namerank,
xend=namerank,
y=0,
yend=costatt,
# The two income rows get their own colour key; every other row is a university.
color = ifelse(namerank %in% c("Real Median Household Income (2022)",
"Real Median Household Income (Black, 2022)"),
"Median Household Income","Cost of Attendance (out-of-state)")),
# NOTE(review): this ifelse() sits outside aes() and reads clg_fee in its
# original row order, whereas the layer data was re-sorted by arrange() above.
# The 1.3/0.7 values are therefore matched to rows by position, not by name,
# and can end up on the wrong bars.
linewidth = ifelse(clg_fee$namerank %in% c("Real Median Household Income (2022)",
"Real Median Household Income (Black, 2022)"),
1.3,0.7) # cost of attendance and income
)+
geom_segment(
aes(x=namerank,
xend=namerank,
y=0,
yend=out_state,
color = "Tuition (out-of-state)"),
linewidth = 0.7 # out-of-state tuition
)+
geom_point(aes(x = namerank,
y=out_state,
color="Tuition (out-of-state)"),
size = 2)+ # out-of-state tuition
geom_point(aes(x = namerank, y = costatt,
color = ifelse(namerank %in% c("Real Median Household Income (2022)",
"Real Median Household Income (Black, 2022)"),
"Median Household Income","Cost of Attendance (out-of-state)")),
# NOTE(review): same positional mismatch as the linewidth above -- the sizes
# come from the unsorted clg_fee, not from the arranged layer data.
size = ifelse(clg_fee$namerank %in% c("Real Median Household Income (2022)",
"Real Median Household Income (Black, 2022)"),
5, 2))+ # cost of attendance and income
geom_segment(
aes(x=namerank,
xend=namerank,
y=0,
yend=in_state,
color = "Tuition (in-state)"),
linewidth = 0.7 # in-state tuition
)+
geom_point(aes(x = namerank, y=in_state, color = "Tuition (in-state)"), size = 2)+ # in-state tuition
# Flip so the names run down the vertical axis and amounts along the horizontal.
coord_flip() +
scale_y_continuous(labels = scales::label_number(scale_cut = scales::cut_short_scale(), suffix = "$"))+
# Colours are keyed by the constant strings used in the color aesthetics above.
scale_color_manual(
values = c(
"Tuition (out-of-state)" = "#779ECB",
"Tuition (in-state)" = "#77DD77",
"Median Household Income" = "orange",
"Cost of Attendance (out-of-state)" = "#757575"
)
)+
theme_ipsum()+
theme(legend.position = "top")+
labs(
# NOTE(review): labs() names labels after aesthetics (x, y, color, ...);
# xlab/ylab are not aesthetics, so these two entries do not set the axis titles.
xlab = "",
ylab = "Undergraduate costs and tuition",
color = "",
title = "University costs are far from affordable",
caption = "Tuition fees source: Visual Capitalist
Note that in-state tuition data is unavailable for most universities \n
Cost of attendance source: University websites
Note that official estimations of cost of attendance are unavailable for Boston College and Northeastern"
)
Here is my data:
Since I wanted to rank universities' cost of attendance in descending order and place household income in the same ranking, I inserted the US median income figures as two rows in the data frame and put the income values under `costatt` (which stands for cost of attendance).
# Data frame literal with 33 rows (looks like dput() output -- assign it to
# clg_fee to run the plotting code).
# The first two rows hold the household-income figures in costatt and are NA in
# rank, school_name, state, out_state and in_state; the other rows are
# universities. in_state is NA for most universities.
structure(list(namerank = c("Real Median Household Income (2022)",
"Real Median Household Income (Black, 2022)", "University of Southern California(Rank28)",
"Brown University(Rank9)", "Duke University(Rank7)", "University of Pennsylvania(Rank6)",
"Cornell University(Rank12)", "Northwestern University(Rank9)",
"University of Chicago(Rank12)", "Columbia University(Rank12)",
"Dartmouth College(Rank18)", "Georgetown University(Rank22)",
"Yale University(Rank5)", "Vanderbilt University(Rank18)", "Carnegie Mellon University(Rank24)",
"Johns Hopkins University(Rank9)", "California Institute of Technology(Rank7)",
"Washington University, St. Louis(Rank24)", "University of Notre Dame(Rank20)",
"Stanford University(Rank3)", "Emory University(Rank24)", "Massachusetts Institute of Technology(Rank2)",
"Princeton University(Rank1)", "Harvard University(Rank3)", "University of Virginia(Rank24)",
"Rice University(Rank17)", "University of Michigan, Ann Arbor(Rank21)",
"University of California, San Diego(Rank28)", "University of California, Berkeley(Rank15)",
"University of California, LA(Rank15)", "University of California, Davis(Rank28)",
"University of North Carolina at Chapel Hill(Rank22)", "University of Florida(Rank28)"
), rank = c(NA, NA, 28, 9, 7, 6, 12, 9, 12, 12, 18, 22, 5, 18,
24, 9, 7, 24, 20, 3, 24, 2, 1, 3, 24, 17, 21, 28, 15, 15, 28,
22, 28), school_name = c(NA, NA, "University of\r\r\r\nSouthern California",
"Brown University", "Duke University", "University of\r\r\r\nPennsylvania",
"Cornell University", "Northwestern University", "University of Chicago",
"Columbia University", "Dartmouth College", "Georgetown University",
"Yale University", "Vanderbilt University", "Carnegie Mellon University",
"Johns Hopkins\r\r\r\nUniversity", "California Institute\r\r\r\nof Technology",
"Washington\r\r\r\nUniversity, St. Louis", "University of Notre Dame",
"Stanford University", "Emory University", "Massachusetts\r\r\r\nInstitute of\r\r\r\nTechnology",
"Princeton University", "Harvard University", "University of Virginia",
"Rice University", "University of\r\r\r\nMichigan, Ann Arbor",
"University of\r\r\r\nCalifornia, San Diego", "University of\r\r\r\nCalifornia, Berkeley",
"University of\r\r\r\nCalifornia, LA", "University of\r\r\r\nCalifornia, Davis",
"University of North\r\r\r\nCarolina at Chapel Hill", "University of Florida"
), state = c(NA, NA, "California", "Rhode Island", "North Carolina",
"Pennsylvania", "New York", "Illinois", "Illinois", "New York",
"New Hampshire", "Washington, DC", "Connecticut", "Tennessee",
"Pennsylvania", "Maryland", "California", "Missouri", "Indiana",
"California", "Georgia", "Massachusetts", "New Jersey", "Massachusetts",
"Virginia", "Texas", "Michigan", "California", "California",
"California", "California", "North Carolina", "Florida"), out_state = c(NA,
NA, 68237, 68230, 66172, 66104, 66014, 65997, 65619, 65524, 65511,
65082, 64700, 63946, 63829, 63340, 63255, 62982, 62693, 62484,
60774, 60156, 59710, 59076, 58950, 58128, 57273, 48630, 48465,
46326, 46043, 39338, 28658), in_state = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 22323, NA, 17786, 16056, 15891, 13752, 15266, 8998,
6381), costatt = c(74580, 52860, 95225, 91676, 88938, 92228,
83296, 91290, 89040, 88942, 91312, 88782, 90975, 89590, 73000,
86065, 80028, 87644, 86125, 92892, 88414, 82720, 86700, 91166,
91440, 86279, 76294, 77886, 78582, 67959, 78996, 66372, 45808
)), class = "data.frame", row.names = c(NA, -33L))
At first I didn't have the `linewidth` or `size` arguments, and the plot was created without any issue. After I added them, R started reporting that `namerank` couldn't be found, even though I thoroughly checked my piping. I added `clg_fee$` in front of `namerank` in those arguments -- hence `size = ifelse(clg_fee$namerank ...)` -- and that made the error go away, but now the median income for Black households and the University of Florida are highlighted instead of both median-income bars.
For the labels, I tried setting `xlab = NULL` inside `labs()`, but that didn't work.
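The issue is that you reordered the data before passing it to `ggplot()` but set `linewidth` and `size` using an `ifelse()` based on the original, unordered dataset, so the values are matched to the wrong rows. Instead, I recommend mapping a variable to the `linewidth` and `size` aesthetics and setting your desired values using either `scale_xxx_identity` or `scale_xxx_manual`, as I do in the code below. Both approaches require slightly more work but are less error-prone.
As for the axis titles: `labs()` has no `xlab`/`ylab` arguments. It expects aesthetic names, so the titles have to be passed as `x = ` and `y = `.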
Note: Because of the long category labels, I aligned both the title and the legend with the "plot" (instead of the "panel"), which, at least for `legend.location`, requires ggplot2 >= 3.5.0.
library(ggplot2)
library(dplyr, warn.conflicts = FALSE)
library(hrbrthemes)

# Lollipop chart of university costs versus median household income.
#
# Every series (cost of attendance / income, out-of-state tuition, in-state
# tuition) is drawn as a segment from 0 plus an end point. Line width and point
# size are *mapped* to the series name inside aes() and translated to numbers
# by manual scales, so the values always follow the row they belong to, no
# matter how the data was sorted beforehand.
clg_fee |>
  arrange(costatt) |>
  mutate(
    # Freeze the sorted order as factor levels so the axis keeps it.
    namerank = factor(namerank, namerank),
    # The two income rows get their own key; all other rows are universities.
    costatt_or_income = ifelse(
      grepl("^Real Median", namerank),
      "Median Household Income", "Cost of Attendance (out-of-state)"
    )
  ) |>
  ggplot(aes(x = namerank, xend = namerank)) +
  # Cost of attendance (universities) or income (the two income rows).
  geom_segment(
    aes(
      y = 0,
      yend = costatt,
      color = costatt_or_income,
      linewidth = costatt_or_income
    )
  ) +
  geom_point(
    aes(
      y = costatt,
      color = costatt_or_income,
      size = costatt_or_income
    )
  ) +
  # Out-of-state tuition. The income rows have no tuition (NA), so drop those
  # rows silently instead of emitting a "removed rows" warning.
  geom_segment(
    aes(
      y = 0,
      yend = out_state,
      color = "Tuition (out-of-state)",
      linewidth = "Tuition (out-of-state)"
    ),
    na.rm = TRUE
  ) +
  geom_point(
    aes(
      y = out_state,
      color = "Tuition (out-of-state)",
      size = "Tuition (out-of-state)"
    ),
    na.rm = TRUE
  ) +
  # In-state tuition; NA for most rows, hence na.rm = TRUE again.
  geom_segment(
    aes(
      y = 0,
      yend = in_state,
      color = "Tuition (in-state)",
      linewidth = "Tuition (in-state)"
    ),
    na.rm = TRUE
  ) +
  geom_point(
    aes(
      y = in_state,
      color = "Tuition (in-state)",
      size = "Tuition (in-state)"
    ),
    na.rm = TRUE
  ) +
  # Names on the vertical axis, amounts on the horizontal one.
  coord_flip() +
  scale_y_continuous(labels = scales::label_number(
    scale_cut = scales::cut_short_scale(), suffix = "$"
  )) +
  scale_color_manual(
    values = c(
      "Tuition (out-of-state)" = "#779ECB",
      "Tuition (in-state)" = "#77DD77",
      "Median Household Income" = "orange",
      "Cost of Attendance (out-of-state)" = "#757575"
    )
  ) +
  # Only the income series is emphasised; the guides are hidden because the
  # colour legend already identifies the series.
  scale_linewidth_manual(
    values = c(
      "Median Household Income" = 1.3,
      "Cost of Attendance (out-of-state)" = 0.7,
      "Tuition (out-of-state)" = 0.7,
      "Tuition (in-state)" = 0.7
    ),
    guide = "none"
  ) +
  scale_size_manual(
    values = c(
      "Median Household Income" = 5,
      "Cost of Attendance (out-of-state)" = 2,
      "Tuition (out-of-state)" = 2,
      "Tuition (in-state)" = 2
    ),
    guide = "none"
  ) +
  theme_ipsum() +
  theme(
    legend.position = "top",
    plot.title.position = "plot",
    # Aligning the legend with the plot requires ggplot2 >= 3.5.0.
    legend.location = "plot"
  ) +
  labs(
    # labs() keys labels by aesthetic name: axis titles are `x` and `y`, not
    # `xlab`/`ylab` (those would be ignored, leaving the default "namerank"
    # and "y" titles in place). NULL removes the title of the name axis.
    x = NULL,
    y = "Undergraduate costs and tuition",
    color = NULL,
    title = "University costs are far from affordable",
    caption = "Tuition fees source: Visual Capitalist
Note that in-state tuition data is unavailable for most universities \n
Cost of attendance source: University websites
Note that official estimations of cost of attendance are unavailable for Boston College and Northeastern"
  )