I have a loop that produces a list of ggplots for any given number of analytes (in this example, 3). An ANOVA is performed and p-value generated and mapped for each ggplot.
However, I also want to annotate these plots with Tukey HSD p-value brackets. If possible, I only want to visualize brackets that are below an adjusted p < 0.05.
I have written two scripts that perform separately these actions; the output for my loop is a list of lists (ggplot elements), while the output from my Tukey HSD script is a dataframe.
The brackets that I am referring to would look something like in this example ggplot:
The following is my code. First, the data:
set.seed(10)
Label<-as.data.frame(c("Baseline","Baseline","Baseline","Baseline","A","B","B","C","D"))
Label<-do.call("rbind", replicate(10, Label, simplify = FALSE))
colnames(Label)<-"Label"
Analyte1<- rnorm(90, mean = 1, sd = 1)
Analyte2<- rnorm(90, mean = 1, sd = 1)
Analyte3<- rnorm(90, mean = .2, sd = 1)
df<-cbind(Label,Analyte1,Analyte2,Analyte3)
The following is my ANOVA loop:
library(dplyr)
library(ggplot2)
library(ggpubr)
library(rstatix)
library(rlist)
# Select numeric columns to obtain df length.
sample_list <-
colnames(select_if(df, is.numeric))
sample_list
# Create a list where the plots will be saved.
ANOVA.plots <- list()
# The for loop.
for (i in 1:length(sample_list)) {
ANOVA.ggplot <-
ggplot(
df,
aes_string(
x = "Label",
y = sample_list[i],
fill = "Label",
title = sample_list[i],
outlier.shape = NA
)
) +
border(color = "black", size = 2.5) +
geom_jitter(
aes(fill = Label),
shape = 21,
size = 4,
color = "black",
stroke = 1,
position = position_jitter()
) +
geom_boxplot(alpha = 0.7,
linetype = 1,
size = 1.1) +
theme(legend.position = "none") +
scale_fill_brewer(palette = "Set2") +
stat_boxplot(geom = "errorbar",
size = 1.1,
width = 0.4) +
theme (axis.title.y = element_blank()) +
theme (axis.title.x = element_blank()) +
stat_compare_means(
inherit.aes = TRUE,
data = df,
method = "anova",
paired = FALSE ,
method.args = list(var.equal = TRUE),
fontface = "bold.italic",
size = 5,
vjust = 0,
hjust = 0
)
ANOVA.plots[[i]] <- ANOVA.ggplot
}
# Print each ggplot element from the list of lists.
ANOVA.plots
Currently, this chunk generates plots that look similar to this one of Analyte 2.
Lastly, this chunk is for obtaining the Tukey HSD data frame.
TUKEY.length<-length(sample_list)
TUKEY.list <-
vector(mode = "list", length = TUKEY.length) # Empty list for looping, where Tukey HSD results are stored.
for (i in 1:TUKEY.length + 1) {
TUKEY.list[[i]] <-
aov(df[[i]] ~ df[[1]]) %>% tukey_hsd() # Obtain Tukey HSD p-values from comparing groups.
}
TUKEY.list <-
TUKEY.list[lengths(TUKEY.list) > 0L] # Remove empty lists, artifacts from piping.
p.value.threshold <- 0.05
TUKEY.df<-list.rbind(TUKEY.list)
Analytes <-
colnames(df[, sapply(df, class) %in% c('integer', 'numeric')])
# Stores the analyte names for Tukey. Only pulls numeric colnames.
# The following was done to properly bind the analyte IDs to the Tukey data frame.
Analytes.df<-as.data.frame(Analytes)
Analytes.df$Value<-c(1:nrow(Analytes.df))
Analytes.df<-do.call("rbind", replicate(10, Analytes.df, simplify = FALSE))
Analytes.df <-
Analytes.df[order(Analytes.df$Value), ]
# Orders the dataframe so that the rownames can be assigned to the TUKEY HSD.
Analytes.df<-as.data.frame(Analytes.df[,-2])
toDrop <- c("^term", "^null") # Columns to drop, left over from Tukey loop.
TUKEY.df<-TUKEY.df[,!grepl(paste(toDrop, collapse = "|"),names(TUKEY.df))]
TUKEY.df<-cbind(Analytes.df, TUKEY.df)
TUKEY.df<-filter(TUKEY.df, p.adj <= p.value.threshold)
The desired output would still show the ANOVA p-value, and have brackets for those groups who have an adjusted p-value below 0.05 following Tukey HSD.
In this example, only Analyte 2, groups A and B, has an adjusted p-value below 0.05 (i.e. p = 0.000754), therefore a bracket should appear above these two plots. However, the code for mapping these Tukey-HSD brackets should be able to work for more variables (i.e. 20+ Analytes) and capture all pairwise comparisons that are significant for each plot, not just the one I have given.
You could use ggsignif
for custom annotations. Example uses p < 0.2 as cutoff to show multiple error bars:
library(ggplot2)
library(ggsignif)
library(cowplot)
library(data.table)
set.seed(10)
df <- data.table(Label=rep(c("Baseline","Baseline","Baseline","Baseline","A","B","B","C","D"), 10),
`Analyte 1` = rnorm(90, mean = 1, sd = 1),
`Analyte 2` = rnorm(90, mean = 1, sd = 1),
`Analyte 3` = rnorm(90, mean = .2, sd = 1))
df[, Label := factor(Label, unique(Label))]
df <- melt(df, id.vars = "Label", variable.name = "Analyte")
dfl <- split(df, df$Analyte, drop=TRUE)
doPlots <- function(x, signif.cutoff=.05){
set.seed(123)
p1 <- ggplot(dfl[[x]], aes(x=Label, y=value, fill=Label)) +
geom_boxplot(alpha = 0.7, linetype = 1, size = 1.1) +
theme(legend.position = "none") +
scale_fill_brewer(palette = "Set2") +
stat_boxplot(geom = "errorbar", size = 1.1, width = 0.4) +
geom_jitter(aes(fill = Label), shape = 21, size = 4, stroke = 1) +
ggtitle(x)
a <- stats::TukeyHSD(stats::aov(value ~ Label, data = dfl[[x]]))[[1]]
a <- stats::setNames(
data.frame(
do.call(rbind, strsplit(rownames(a), "-")),
a[, "p adj"]
), c("Var1", "Var2", "p") )
a <- subset(a[stats::complete.cases(a),], p < signif.cutoff)
if (nrow(a) == 0) return(p1) else {
a$p <- formatC(signif(a$p, digits = 3),
digits = 3, format = "g", flag = "#")
keep.tests <- unname(t(apply(a[, -3], 1, sort)))
keep.tests <- unname(split(keep.tests, seq(dim(keep.tests)[1])))
ts <- list(a = a, keep.tests = keep.tests)
p1 + ggsignif::geom_signif(
comparisons=ts$keep.tests,
test="TukeyHSD",
annotations=ts$a$p,
step_increase=0.1)
}
}
plot_grid(plotlist=lapply(names(dfl), doPlots, signif.cutoff=.2), ncol=3)
Created on 2021-01-29 by the reprex package (v1.0.0)