rggplot2parallel-coordinates

How to create multiple y axis in ggplot2 (one for each variable)


I would like to draw a plot similar to a Parallel Coordinates Plot for the descriptive statistics. I want to plot the mean and standard deviation for each variable stratified by sex.

enter image description here

Unfortunately, I couldnt figure out a way to create an own y axis for each variable.

It should look similar to this plot, but with the means and standard deviations instead of each row.

enter image description here

In this post, they created the other plot, but I wasnt able to adjust it to my needs and my data, so that mean and standard deviation are depicted as in the plot above.

This is my data and code:


library(dplyr)
library(tidyr)
library(ggplot2)

my_data <- data.frame(
  sex = c("m", "w", "m", "w", "m", "w"),
  age = c(25, 30, 22, 35, 28, 46),
  testosterone = c(450, 200, 400, 300, 500, 350),
  cognition = c(75, 80, 70, 85, 78, 90),
  estrogen = c(20, 40, 15, 50, 10, 45)
)

numeric_vars <- c("age", "testosterone", "cognition", "estrogen")

# Calculate means and standard deviations for each sex and each variable
df_means <- my_data %>%
  group_by(sex) %>%
  summarise(across(all_of(numeric_vars), mean, na.rm = TRUE))

df_sds <- my_data %>%
  group_by(sex) %>%
  summarise(across(all_of(numeric_vars), sd, na.rm = TRUE))

# Reshape data for ggplot
df_melted <- reshape2::melt(df_means, id.vars = 'sex')
df_sds_melted <- reshape2::melt(df_sds, id.vars = 'sex')

# Combine means and standard deviations
df_combined <- merge(df_melted, df_sds_melted, by = c('sex', 'variable'))
names(df_combined)[names(df_combined) == 'value.x'] <- 'mean'
names(df_combined)[names(df_combined) == 'value.y'] <- 'sd'

# Create a parallel coordinates plot without normalization
ggplot(df_combined, aes(x = variable, y = mean, group = sex, color = sex)) +
  geom_line() +
  geom_ribbon(aes(ymin = mean - sd, ymax = mean + sd, fill = sex), alpha = 0.2) +
  scale_color_manual(values = c("m" = "#007BC3", "w" = "#EA5451")) +
  scale_fill_manual(values = c("m" = "#007BC3", "w" = "#EA5451")) +
  theme_minimal() 



Solution

  • Something like this?

    The only "constants" in this are:

    I opted to set the levels of the variable factor to match the order of variables in your starting plot (which did not match your code), feel free to unset or change levels=.

    library(dplyr)
    library(tidyr)
    library(ggplot2)
    
    long_data <- my_data %>%
      summarize(
        across(all_of(numeric_vars), list(mu = mean, sd = sd, min = min, max = max)),
        .by = sex
      ) %>%
      pivot_longer(-sex, names_pattern = "(.*)_(.*)", names_to = c("variable", ".value")) %>%
      mutate(
        variable = factor(variable, levels = c("age", "testosterone", "cognition", "estrogen")),
        x = as.numeric(variable)
      ) %>%
      mutate(
        min = min(min), max = max(max),
        mu_01 = scales::rescale(mu, from = c(min[1], max[1]), to = 0:1),
        sd_01 = sd / abs(min[1] - max[1]),
        .by = variable
      )
    ticks <- summarize(long_data, .by = c(variable, x),
                       y = pretty(c(min, max), n = 8),
                       y_01 = scales::rescale(y, from = c(min[1], max[1]), to = 0:1))
    tick_ends <- c(-0.05, 0)
    
    ggplot(long_data, aes(x, mu_01, group = sex, color = sex)) +
      geom_segment(
        aes(x = x + tick_ends[1], xend = x + tick_ends[2], y = y_01, yend = y_01),
        data = ticks, inherit.aes = FALSE
      ) +
      geom_line(
        aes(x = x, y = y_01, group = variable),
        data = summarize(ticks, .by = variable, x = x[1] + tick_ends[2], y_01 = range(y_01)),
        inherit.aes = FALSE
      ) +
      geom_ribbon(aes(ymin = mu_01 - sd_01, ymax = mu_01 + sd_01, fill = sex), alpha = 0.2) +
      geom_line() +
      geom_text(
        aes(x = x + tick_ends[1], y = y_01, label = y),
        hjust = 1.2, data = ticks, inherit.aes = FALSE
      ) +
      scale_x_continuous(
        name = NULL,
        breaks = seq_along(levels(long_data$variable)),
        labels = levels(long_data$variable),
        limits = range(as.numeric(long_data$variable)) + c(-0.1, 0.1),
        minor_breaks = seq(1, 10, 1),
        position = "top"
      ) +
      scale_y_continuous(name = NULL, breaks = NULL) +
      scale_color_manual(values = c("m" = "#007BC3", "w" = "#EA5451")) +
      scale_fill_manual(values = c("m" = "#007BC3", "w" = "#EA5451")) +
      theme_minimal()
    

    enter image description here

    The use of .by= in the dplyr verbs requires dplyr_1.1.0 or later; if you have an older version of dplyr, remove the .by=c(..) and add corresponding group_by(..) before the respective verb.

    One possible (low-grade?) "risk" of scaling all variables to [0,1] based on their observed values is that it can hide or inflate that inferred relative magnitude/importance between the variables. Not sure there's an easy way to address this here.


    Extension

    To make the inner axes align, I don't know if there's a better way than to control the rescaling in a way like this. Now using new data:

    my_data <- data.frame( sex = c("m", "w", "m", "w", "m", "w"), age = c(25, 30, 22, 35, 28, 46), testosterone = c(450, 5, 400, 10, 500, 15), cognition = c(0.1, 0.5, 0.3, 0.6, 0.4, 0.1), estrogen = c(20, 80, 15, 60, 10, 70), Var6 = c(1000, 900, 600, 800, 700, 500), Var7 = c(3, 5, 6, 7, 3, 4), Var8 = c(20, 15, 30, 35, 25, 60), Var9 = c(0.6, 0.9, 1.4, 2.1, 1.2, 0.6), Var10 = c(200, 150, 300, 200, 130, 400), Var11 = c(900, 800, 450, 600, 650, 750), Var12 = c(30, 45, 60, 40, 30, 45) ) 
    numeric_vars <- setdiff(names(my_data), "sex")
    

    Using that and slightly-updated code:

    extend_rescale <- function(x, name = "variable") {
      p <- pretty(range(x))
      d <- diff(p)[1]
      p <- c(p[1], p[length(p)]) + d * c(-1, 1)
      out <- data.frame(
        x = scales::rescale(x, from = p, to = 0:1),
        step = d, min = p[1], max = p[2]
      )
      names(out)[1] <- name
      out
    }
    
    long_data <- my_data %>%
      summarize(
        across(all_of(numeric_vars), list(mu = mean, sd = sd, min = min, max = max)),
        .by = sex
      ) %>%
      pivot_longer(-sex, names_pattern = "(.*)_(.*)", names_to = c("variable", ".value")) %>%
      mutate(
        variable = factor(variable), # set levels= if desired
        x = as.numeric(variable)
      ) %>%
      mutate(
        extend_rescale(c(range(c(mu + sd, mu - sd)), mu), name = "mu_01")[-(1:2),],
        sd_01 = sd / abs(max[1] - min[1]),
        .by = variable)
    ticks <- summarize(long_data, .by = c(variable, x),
                       y = seq(min[1], max[2], by = step[1]),
                       y_01 = scales::rescale(y, from = c(min[1], max[1]), to = 0:1))
    tick_ends <- c(-0.05, 0)
    

    (Same plot code.)

    enter image description here