I would like to draw a plot similar to a Parallel Coordinates Plot for the descriptive statistics. I want to plot the mean and standard deviation for each variable stratified by sex.
Unfortunately, I couldnt figure out a way to create an own y axis for each variable.
It should look similar to this plot, but with the means and standard deviations instead of each row.
In this post, they created the other plot, but I wasnt able to adjust it to my needs and my data, so that mean and standard deviation are depicted as in the plot above.
This is my data and code:
library(dplyr)
library(tidyr)
library(ggplot2)
my_data <- data.frame(
sex = c("m", "w", "m", "w", "m", "w"),
age = c(25, 30, 22, 35, 28, 46),
testosterone = c(450, 200, 400, 300, 500, 350),
cognition = c(75, 80, 70, 85, 78, 90),
estrogen = c(20, 40, 15, 50, 10, 45)
)
numeric_vars <- c("age", "testosterone", "cognition", "estrogen")
# Calculate means and standard deviations for each sex and each variable
df_means <- my_data %>%
group_by(sex) %>%
summarise(across(all_of(numeric_vars), mean, na.rm = TRUE))
df_sds <- my_data %>%
group_by(sex) %>%
summarise(across(all_of(numeric_vars), sd, na.rm = TRUE))
# Reshape data for ggplot
df_melted <- reshape2::melt(df_means, id.vars = 'sex')
df_sds_melted <- reshape2::melt(df_sds, id.vars = 'sex')
# Combine means and standard deviations
df_combined <- merge(df_melted, df_sds_melted, by = c('sex', 'variable'))
names(df_combined)[names(df_combined) == 'value.x'] <- 'mean'
names(df_combined)[names(df_combined) == 'value.y'] <- 'sd'
# Create a parallel coordinates plot without normalization
ggplot(df_combined, aes(x = variable, y = mean, group = sex, color = sex)) +
geom_line() +
geom_ribbon(aes(ymin = mean - sd, ymax = mean + sd, fill = sex), alpha = 0.2) +
scale_color_manual(values = c("m" = "#007BC3", "w" = "#EA5451")) +
scale_fill_manual(values = c("m" = "#007BC3", "w" = "#EA5451")) +
theme_minimal()
Something like this?
The only "constants" in this are:
tick_ends
to define the offsets for the tick segments; andhjust = 1.2
, providing a little distance between the segments and right-side of each text label.pretty(n=8)
, for the approximate number of ticks to add) is also a magic-constant.)I opted to set the levels of the variable
factor to match the order of variables in your starting plot (which did not match your code), feel free to unset or change levels=
.
library(dplyr)
library(tidyr)
library(ggplot2)
long_data <- my_data %>%
summarize(
across(all_of(numeric_vars), list(mu = mean, sd = sd, min = min, max = max)),
.by = sex
) %>%
pivot_longer(-sex, names_pattern = "(.*)_(.*)", names_to = c("variable", ".value")) %>%
mutate(
variable = factor(variable, levels = c("age", "testosterone", "cognition", "estrogen")),
x = as.numeric(variable)
) %>%
mutate(
min = min(min), max = max(max),
mu_01 = scales::rescale(mu, from = c(min[1], max[1]), to = 0:1),
sd_01 = sd / abs(min[1] - max[1]),
.by = variable
)
ticks <- summarize(long_data, .by = c(variable, x),
y = pretty(c(min, max), n = 8),
y_01 = scales::rescale(y, from = c(min[1], max[1]), to = 0:1))
tick_ends <- c(-0.05, 0)
ggplot(long_data, aes(x, mu_01, group = sex, color = sex)) +
geom_segment(
aes(x = x + tick_ends[1], xend = x + tick_ends[2], y = y_01, yend = y_01),
data = ticks, inherit.aes = FALSE
) +
geom_line(
aes(x = x, y = y_01, group = variable),
data = summarize(ticks, .by = variable, x = x[1] + tick_ends[2], y_01 = range(y_01)),
inherit.aes = FALSE
) +
geom_ribbon(aes(ymin = mu_01 - sd_01, ymax = mu_01 + sd_01, fill = sex), alpha = 0.2) +
geom_line() +
geom_text(
aes(x = x + tick_ends[1], y = y_01, label = y),
hjust = 1.2, data = ticks, inherit.aes = FALSE
) +
scale_x_continuous(
name = NULL,
breaks = seq_along(levels(long_data$variable)),
labels = levels(long_data$variable),
limits = range(as.numeric(long_data$variable)) + c(-0.1, 0.1),
minor_breaks = seq(1, 10, 1),
position = "top"
) +
scale_y_continuous(name = NULL, breaks = NULL) +
scale_color_manual(values = c("m" = "#007BC3", "w" = "#EA5451")) +
scale_fill_manual(values = c("m" = "#007BC3", "w" = "#EA5451")) +
theme_minimal()
The use of .by=
in the dplyr verbs requires dplyr_1.1.0
or later; if you have an older version of dplyr, remove the .by=c(..)
and add corresponding group_by(..)
before the respective verb.
One possible (low-grade?) "risk" of scaling all variables to [0,1]
based on their observed values is that it can hide or inflate that inferred relative magnitude/importance between the variables. Not sure there's an easy way to address this here.
To make the inner axes align, I don't know if there's a better way than to control the rescaling in a way like this. Now using new data:
my_data <- data.frame( sex = c("m", "w", "m", "w", "m", "w"), age = c(25, 30, 22, 35, 28, 46), testosterone = c(450, 5, 400, 10, 500, 15), cognition = c(0.1, 0.5, 0.3, 0.6, 0.4, 0.1), estrogen = c(20, 80, 15, 60, 10, 70), Var6 = c(1000, 900, 600, 800, 700, 500), Var7 = c(3, 5, 6, 7, 3, 4), Var8 = c(20, 15, 30, 35, 25, 60), Var9 = c(0.6, 0.9, 1.4, 2.1, 1.2, 0.6), Var10 = c(200, 150, 300, 200, 130, 400), Var11 = c(900, 800, 450, 600, 650, 750), Var12 = c(30, 45, 60, 40, 30, 45) )
numeric_vars <- setdiff(names(my_data), "sex")
Using that and slightly-updated code:
extend_rescale <- function(x, name = "variable") {
p <- pretty(range(x))
d <- diff(p)[1]
p <- c(p[1], p[length(p)]) + d * c(-1, 1)
out <- data.frame(
x = scales::rescale(x, from = p, to = 0:1),
step = d, min = p[1], max = p[2]
)
names(out)[1] <- name
out
}
long_data <- my_data %>%
summarize(
across(all_of(numeric_vars), list(mu = mean, sd = sd, min = min, max = max)),
.by = sex
) %>%
pivot_longer(-sex, names_pattern = "(.*)_(.*)", names_to = c("variable", ".value")) %>%
mutate(
variable = factor(variable), # set levels= if desired
x = as.numeric(variable)
) %>%
mutate(
extend_rescale(c(range(c(mu + sd, mu - sd)), mu), name = "mu_01")[-(1:2),],
sd_01 = sd / abs(max[1] - min[1]),
.by = variable)
ticks <- summarize(long_data, .by = c(variable, x),
y = seq(min[1], max[2], by = step[1]),
y_01 = scales::rescale(y, from = c(min[1], max[1]), to = 0:1))
tick_ends <- c(-0.05, 0)
(Same plot code.)