I'm struggling to reproduce this graph that I built in Tableau.
I'm missing the Sub-totals, but think I can solve that problem. What's causing me the problem is sorting Transportation Mode by the SUM() of the overall measure. Feel like it should be handled dynamically, but I'm OK hard-coding to force it in the order that I want just to get over this hurdle and keep moving forward.
Here's what my R version looks like (disregard the forcats vertical thing on the left side, that's just junk created when I was trying to build the graphs for this post).
It's important that Personal Vehicles is on top and Pedestrians is second in each facet. When I add the subtotals into my R dataset, I will need it to be at the bottom of the facet. Not sure how to control subtotals if I can't even control the main field.
Here's the data behind the graph:
Border_Code | measure_summary | mode_color_code | annual_crossings | record_cnt |
---|---|---|---|---|
CAN | Personal Vehicles | #C14F22 | 20608230 | 973 |
CAN | Trucks | #34844A | 5526056 | 896 |
CAN | Pedestrians | #4E79A7 | 215983 | 347 |
CAN | Buses | #8074A8 | 48334 | 398 |
CAN | Trains | #848E93 | 20732 | 276 |
MEX | Personal Vehicles | #C14F22 | 75891766 | 312 |
MEX | Pedestrians | #4E79A7 | 39421651 | 329 |
MEX | Trucks | #34844A | 7356659 | 257 |
MEX | Buses | #8074A8 | 118719 | 179 |
MEX | Trains | #848E93 | 10772 | 84 |
Here's my R code to produce the dataset that does some of the early data wrangling before I aggregate to the CSV file. I don't think this is the problem, but I do create the measure_summary factor field in this section.
# Add some features to the BASE data frame. This is the BASE that all sub-queries (data frames) will pull from.
data_00_base_mod <- data_00_base %>%
mutate(Border_Code = ifelse(grepl("Canada", Border, fixed = TRUE), "CAN", "MEX")) %>%
separate(Date, c("Month", "Year"), remove = FALSE) %>%
# mutate(Measure = factor(Measure, order = TRUE, levels = c("Bus Passengers","Buses","Pedestrians","Personal Vehicle Passengers","Personal Vehicles","Rail Containers Empty","Rail Containers Loaded","Train Passengers","Trains","Truck Containers Empty","Truck Containers Loaded","Trucks")))
# mutate(Measure = factor(Measure, order = TRUE, levels = c("Personal Vehicles","Personal Vehicle Passengers","Pedestrians","Trucks","Truck Containers Loaded","Truck Containers Empty","Buses","Bus Passengers","Trains","Train Passengers","Rail Containers Loaded","Rail Containers Empty")))
mutate(measure_summary = factor(Measure, order = TRUE, levels = c("Personal Vehicles","Personal Vehicle Passengers","Pedestrians","Trucks","Truck Containers Loaded","Truck Containers Empty","Buses","Bus Passengers","Trains","Train Passengers","Rail Containers Loaded","Rail Containers Empty"))) %>%
mutate(Measure = factor(Measure, order = TRUE, levels = c("Trains","Train Passengers","Rail Containers Loaded","Rail Containers Empty","Buses","Bus Passengers","Trucks","Truck Containers Loaded","Truck Containers Empty","Pedestrians","Personal Vehicles","Personal Vehicle Passengers"))) %>%
# mutate(Year_num) = as.integer(Year)
mutate(state_delete_me = State)
data_00_base_mod$Year_num <- as.integer(data_00_base_mod$Year)
# Augment with a dimension table to help control colors and sorting in the presentation tier
# This is all stuff I decided on when building the Tableau dashboard. I want to follow that lead with R to make it easier to compare/contrast output side-by-side
dim_measure <- data.frame(
Measure=c("Bus Passengers","Buses","Pedestrians","Personal Vehicle Passengers","Personal Vehicles","Rail Containers Empty","Rail Containers Loaded","Train Passengers","Trains","Truck Containers Empty","Truck Containers Loaded","Trucks")
,mode_main_sort=c(8,7,3,2,1,11,10,12,9,6,5,4)
,mode_color_code=c("#C799BC","#8074A8","#4E79A7","#F59C3C","#C14F22","#CDCECD","#5B6570","#89C8CC","#848E93","#F4D166","#B2C25B","#34844A")
)
# Left Join the Base table with supplemental information about the Measures to help with the presentation layer
data_00_base_mod <- merge(x = data_00_base_mod, y = dim_measure, by = "Measure", all.x = TRUE)
And this is the block of code that builds the dataset for the graph (and the csv attachment).
# A second pass at this using a secondary dim field, measure_summary
data_10b_core <- data_00_base_mod %>%
filter(measure_summary == "Personal Vehicles" | measure_summary == "Pedestrians" | measure_summary == "Trucks" | measure_summary == "Buses" | measure_summary =="Trains") %>%
filter(Year == "2023") %>%
select(Border_Code, measure_summary, mode_color_code, Value) %>%
group_by(Border_Code, measure_summary, mode_color_code) %>%
summarise(annual_crossings = sum(Value), record_cnt = n(), .groups = 'keep') %>%
arrange(Border_Code, - annual_crossings)
gg_10b_year_summary <- ggplot(data_10b_core) +
geom_col(aes(x=measure_summary, y=annual_crossings, fill = mode_color_code)) +
geom_text(aes(x=measure_summary, y=annual_crossings, label = paste(round(annual_crossings / 1e6, 2), "M")), vjust = 0.5, hjust = -0.2) +
coord_flip() +
scale_fill_identity() +
facet_grid(vars(Border_Code, )) +
theme_minimal() +
theme(axis.text.x = element_text(angle=0),
panel.background = element_blank(),
strip.background = element_rect(colour="gray50", fill="gray90"),
panel.border = element_rect(colour = "gray50", fill=NA, size=1)) +
scale_y_continuous(name = "Inbound Border Crossings",
labels = unit_format(unit = "M", scale = 1e-6)
) +
labs(title = "Border Crossings by Mode and Country in 2023")
plot(gg_10b_year_summary)
Many thanks for any help you can provide.
Cheers!
By default the categories are ordered alphabetically. If you want a specific order you have to convert to a factor
with your desired order. And if you want to reorder based on the value of a numeric variable you could do so using reorder
. By default reorder
will reorder based on the mean but you could change that by setting FUN=sum
to reorder based on the sum:
Note: I switched x
and y
to get rid of the coord_flip
and used geom_label
(with fill=NA
and label.size=0
) instead of geom_text
to add some padding around the labels.
library(ggplot2)
library(scales)
ggplot(data_10b_core, aes(
x = annual_crossings,
y = reorder(measure_summary, annual_crossings, FUN = sum)
)) +
geom_col(
aes(fill = mode_color_code)
) +
geom_label(
aes(
label = paste(round(annual_crossings / 1e6, 2), "M")
),
hjust = 0,
fill = NA,
label.size = 0
) +
scale_fill_identity() +
facet_grid(vars(Border_Code)) +
theme_minimal() +
theme(
axis.text.y = element_text(angle = 0),
panel.background = element_blank(),
strip.background = element_rect(colour = "gray50", fill = "gray90"),
panel.border = element_rect(colour = "gray50", fill = NA, size = 1)
) +
scale_x_continuous(
name = "Inbound Border Crossings",
labels = unit_format(unit = "M", scale = 1e-6)
) +
labs(title = "Border Crossings by Mode and Country in 2023")
#> Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
#> ℹ Please use the `linewidth` argument instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
DATA
data_10b_core <- structure(list(Border_Code = c(
"CAN", "CAN", "CAN", "CAN", "CAN",
"MEX", "MEX", "MEX", "MEX", "MEX"
), measure_summary = c(
"Personal Vehicles",
"Trucks", "Pedestrians", "Buses", "Trains", "Personal Vehicles",
"Pedestrians", "Trucks", "Buses", "Trains"
), mode_color_code = c(
"#C14F22",
"#34844A", "#4E79A7", "#8074A8", "#848E93", "#C14F22", "#4E79A7",
"#34844A", "#8074A8", "#848E93"
), annual_crossings = c(
20608230,
5526056, 215983, 48334, 20732, 75891766, 39421651, 7356659, 118719,
10772
), record_cnt = c(
973, 896, 347, 398, 276, 312, 329, 257,
179, 84
)), class = "data.frame", row.names = c(NA, -10L))