Match boxplot and label colors according to Tukey's significance letters in ggplot

I am trying to match boxplot and label colors according to Tukey's significance letters in ggplot2 or ggboxplot.

I don't know how to do it automatically, or in a more elegant way, for example by using terrain.colors.

I have done it manually, only to show my desired plot, in which each boxplot and its label share the color assigned to their Tukey's significance letter:

What I mean is that all boxplots labelled "a" should share one color, all boxplots labelled "b" another, and so on, with each letter drawn in the same color as its boxplot. Something like this, but using ggplot: https://r-graph-gallery.com/84-tukey-test_files/figure-html/unnamed-chunk-3-1.png

Your help would be much appreciated.

Here is the script based on the accepted answer of this post: Is there a function to add AOV post-hoc testing results to ggplot2 boxplot?

library(plyr)
library(ggplot2)
library(multcompView)

# Simulated data: three groups of ten draws each; groups 2 and 3 are shifted
# by 0.1 and 3 respectively
set.seed(0)
lev <- gl(n = 3, k = 10)
y <- c(rnorm(10), rnorm(10) + 0.1, rnorm(10) + 3)
d <- data.frame(lev = lev, y = y)

# One-way ANOVA, followed by Tukey's HSD on all pairwise group comparisons
a <- aov(formula = y ~ lev, data = d)
tHSD <- TukeyHSD(x = a, ordered = FALSE, conf.level = 0.95)

# Build the data frame used to print Tukey letters above each boxplot.
#   HSD  : result of TukeyHSD()
#   flev : name (string) of the factor to look up in HSD and in the global `d`
# Returns one row per factor level with the columns plot.labels (level name),
# labels (Tukey letters) and V1 (height at which to draw the label).
generate_label_df <- function(HSD, flev){
  # Fourth column of the comparison matrix for this factor, turned into a
  # named vector of letters (the names are the factor levels)
  pairwise_p <- HSD[[flev]][, 4]
  letter_vec <- multcompLetters(pairwise_p)[["Letters"]]

  # Factor levels next to their letters
  level_letters <- data.frame(
    plot.labels = names(letter_vec),
    labels = letter_vec,
    stringsAsFactors = FALSE
  )

  # Label height per group: largest value of the five-number summary of y
  # (read from the global `d`), plus a small gap
  label_height <- ddply(d, flev, function(grp) max(fivenum(grp$y)) + 0.2)

  # Join letters and heights on the factor level
  merge(
    level_letters, label_height,
    by.x = "plot.labels", by.y = flev, sort = FALSE
  )
}

# Generate ggplot
# Fill and text colours are typed in by hand here; they are constants, not
# aesthetics mapped from the data.
ggplot(d, aes(x = lev, y = y)) +
  geom_boxplot(fill = c("green", "green", "orange")) +
  geom_text(
    data = generate_label_df(tHSD, "lev"),
    mapping = aes(x = plot.labels, y = V1, label = labels),
    colour = c("green", "orange", "green")
  ) +
  scale_colour_manual(values = c("green", "green", "orange"))

Solution

Does this work for you? Find my comments below.

library(plyr)
library(ggplot2)
library(multcompView)

# Reproducible toy data: 30 standard-normal draws, with a per-group shift of
# 0, 0.1 and 3 added to groups 1, 2 and 3 (ten observations each)
set.seed(0)
lev <- gl(n = 3, k = 10)
y <- rnorm(30) + rep(c(0, 0.1, 3), each = 10)
d <- data.frame(lev = lev, y = y)

# ANOVA of y on the grouping factor, then Tukey's HSD for every pair of groups
a <- aov(y ~ lev, data = d)
tHSD <- TukeyHSD(a, ordered = FALSE, conf.level = 0.95)

# Build the data frame used to print Tukey letters above each boxplot.
#
# Arguments:
#   HSD    result of TukeyHSD(): a list with one comparison matrix per factor
#   flev   name (single string) of the factor, both as an element of `HSD`
#          and as a column of `data`
#   data   data frame with the raw observations; defaults to the global `d`
#          so that existing two-argument calls keep working
#   yvar   name (single string) of the response column in `data`
#   offset vertical gap added above each group's maximum for the label
#
# Returns a data frame with one row per factor level and the columns
#   plot.labels (level name), labels (Tukey letters), V1 (label height).
#
# NOTE(review): multcompLetters() appears to recover the level names by
# splitting comparison names such as "2-1" on the hyphen, so factor levels that
# themselves contain "-" are presumably mis-parsed -- verify before relying on
# this with such levels.
generate_label_df <- function(HSD, flev, data = d, yvar = "y", offset = 0.2) {
  stopifnot(
    is.character(flev), length(flev) == 1,
    is.character(yvar), length(yvar) == 1,
    flev %in% names(HSD),
    all(c(flev, yvar) %in% names(data))
  )

  # Column 4 of a TukeyHSD comparison matrix holds the adjusted p-values.
  # The comparison names are re-attached explicitly: with a single comparison
  # (two-level factor) the matrix subset collapses to an unnamed scalar, and
  # the level names are later read back from the names of the letters.
  comparisons <- HSD[[flev]]
  p_adj <- setNames(comparisons[, 4], rownames(comparisons))
  letter_vec <- multcompLetters(p_adj)[["Letters"]]

  # Factor levels next to their homogeneous-group letters
  level_letters <- data.frame(
    plot.labels = names(letter_vec),
    labels = letter_vec,
    stringsAsFactors = FALSE
  )

  # Label height per group: the largest value of the five-number summary
  # (i.e. the group maximum, not an upper quartile) plus a small gap.
  # ddply() names the resulting column V1.
  label_height <- ddply(
    data, flev,
    function(grp) max(fivenum(grp[[yvar]])) + offset
  )

  # Join letters and heights on the factor level
  merge(
    level_letters, label_height,
    by.x = "plot.labels", by.y = flev, sort = FALSE
  )
}

#############################
### new stuff starts here ###
#############################

# One row per factor level: level name, Tukey letters and label height.
# The level name is duplicated into a column called `lev` so that the text
# layer can be coloured by the same variable name as the boxplots.
label_df <- generate_label_df(tHSD, 'lev')
label_df$lev <- label_df$plot.labels

# One colour per distinct letter group, interpolated between green and orange.
# Levels that share the same letters therefore share a colour automatically,
# instead of the colours being typed in by hand for every level.
letter_groups <- sort(unique(as.character(label_df$labels)))
letter_cols <- setNames(
  colorRampPalette(c("green", "orange"))(length(letter_groups)),
  letter_groups
)

#Generate ggplot
# Named vector "level" = "colour", looked up through each level's letters.
# For the example data (two letter groups) this gives pure green for the
# first group and pure orange for the second.
lev_cols <- setNames(
  unname(letter_cols[as.character(label_df$labels)]),
  label_df$lev
)

ggplot(d, aes(x = lev, y = y)) + 
  geom_boxplot(aes(fill = lev)) +
  geom_text(
    data = label_df,
    aes(
      x = plot.labels, 
      y = V1, 
      label = labels, 
      color = lev
    )
  ) +
  scale_color_manual(values = lev_cols) +
  scale_fill_manual(values = lev_cols)

^{Created on 2022-10-14 with reprex v2.0.2}

As you can see, you can tell each geom_*() layer, inside its aes() (!), that it should be colored according to a column, e.g. lev. After doing that, you define which level of lev gets which color via a named vector c("Levelname1" = "Colorname1", ...), as we have here with lev_cols, and pass it to scale_color_manual().

This specific example is a bit more complex, because geom_boxplot() needs a different fill per group while geom_text() needs a different color per group, so we need both scale_fill_manual() and scale_color_manual(). Furthermore, the data supplied to geom_text() does not originally have a column named lev, so I added one (a copy of plot.labels) to keep things simple.

Bonus

FYI, you may also find the following alternative approach to get the compact letters display, as well as the alternative way to plot the results interesting. There's more on this here.

# extra -------------------------------------------------------------------
library(tidyverse)
library(emmeans)
library(multcomp)
library(multcompView)

# Simulated data, rebuilt here so that this section runs on its own:
# three groups of ten observations with shifts of 0, 0.1 and 3
set.seed(0)
lev <- gl(n = 3, k = 10)
y <- rnorm(n = 30) + rep(c(0, 0.1, 3), each = 10)
d <- data.frame(lev, y)


# Compact letter display via emmeans --------------------------------------
# linear model of y with lev as the only predictor
model <- lm(formula = y ~ lev, data = d)

# estimated marginal mean of y for each level of lev
model_means <- emmeans(model, specs = "lev")

# compact letter display: append a grouping-letter column to the means table
model_means_cld <- cld(
  model_means,
  adjust = "Tukey",
  Letters = letters,
  alpha = 0.05
)
#> Note: adjust = "tukey" was changed to "sidak"
#> because "tukey" is only appropriate for one set of pairwise comparisons
# print the resulting table
model_means_cld
#>  lev emmean    SE df lower.CL upper.CL .group
#>  2   -0.262 0.283 27   -0.982    0.457  a    
#>  1    0.359 0.283 27   -0.361    1.079  a    
#>  3    3.069 0.283 27    2.350    3.789   b   
#> 
#> Confidence level used: 0.95 
#> Conf-level adjustment: sidak method for 3 estimates 
#> P value adjustment: tukey method for comparing a family of 3 estimates 
#> significance level used: alpha = 0.05 
#> NOTE: If two or more means share the same grouping letter,
#>       then we cannot show them to be different.
#>       But we also did not show them to be the same.


# You may also like this plot ---------------------------------------------
ggplot() +
  # overall look: classic theme, caption drawn with ggtext's simple text box
  theme_classic() +
  theme(plot.caption = ggtext::element_textbox_simple()) +
  # raw observations: semi-transparent points, nudged left of the group position
  geom_point(
    data = d,
    mapping = aes(x = lev, y = y),
    position = position_nudge(x = -0.2),
    shape = 16,
    alpha = 0.5
  ) +
  # narrow boxplot without outlier points, nudged slightly left
  geom_boxplot(
    data = d,
    mapping = aes(x = lev, y = y),
    position = position_nudge(x = -0.1),
    width = 0.05,
    outlier.shape = NA
  ) +
  # estimated marginal mean per group, in red
  geom_point(
    data = model_means_cld,
    mapping = aes(x = lev, y = emmean),
    color = "red",
    size = 2
  ) +
  # confidence limits of each mean, in red
  geom_errorbar(
    data = model_means_cld,
    mapping = aes(x = lev, ymin = lower.CL, ymax = upper.CL),
    color = "red",
    width = 0.05
  ) +
  # grouping letters (whitespace trimmed), left-aligned just right of the mean
  geom_text(
    data = model_means_cld,
    mapping = aes(x = lev, y = emmean, label = str_trim(.group)),
    position = position_nudge(x = 0.1),
    color = "red",
    hjust = 0
  ) +
  # explanatory caption
  labs(
    caption = "Black dots represent raw data. Red dots and error bars represent (estimated marginal) means ± 95% confidence interval per group. Means not sharing any letter are significantly different by the Tukey-test at the 5% level of significance."
  )

^{Created on 2022-10-14 with reprex v2.0.2}