rggplot2p-valuet-test

How to perform t test and plot p-values for comparison between groups on a grouped boxplot (ggplot)?


I have a data frame, as shown below:

> dput(filtered_lymph)
structure(list(cluster = c("CD4+ Tcells", "CD4+ Tcells", "CD4+ Tcells", 
"CD4+ Tcells", "CD4+ Tcells", "CD4+ Tcells", "CD4+ Tcells", "CD4+ Tcells", 
"CD4+ Tcells", "CD4+ Tcells", "CD4+ Tcells", "CD4+ Tcells", "CD4+ Tcells", 
"CD4+ Tcells", "CD4+ Tcells", "CD4+ Tcells", "CD4+ Tcells", "CD4+ Tcells", 
"CD4+ Tcells", "CD4+ Tcells", "CD4+ Tcells", "CD4+ Tcells", "CD4+ Tcells", 
"CD4+ Tcells", "CD4+ Tcells", "CD4+ Tcells", "CD4+ Tcells", "CD8+ Effector Tcells", 
"CD8+ Effector Tcells", "CD8+ Effector Tcells", "CD8+ Effector Tcells", 
"CD8+ Effector Tcells", "CD8+ Effector Tcells", "CD8+ Effector Tcells", 
"CD8+ Effector Tcells", "CD8+ Effector Tcells", "CD8+ Effector Tcells", 
"CD8+ Effector Tcells", "CD8+ Effector Tcells", "CD8+ Effector Tcells", 
"CD8+ Effector Tcells", "CD8+ Effector Tcells", "CD8+ Effector Tcells", 
"CD8+ Effector Tcells", "CD8+ Effector Tcells", "CD8+ Effector Tcells", 
"CD8+ Effector Tcells", "CD8+ Effector Tcells", "CD8+ Effector Tcells", 
"CD8+ Effector Tcells", "CD8+ Effector Tcells", "CD8+ Effector Tcells", 
"CD8+ Effector Tcells", "CD8+ Effector Tcells", "NK Cells", "NK Cells", 
"NK Cells", "NK Cells", "NK Cells", "NK Cells", "NK Cells", "NK Cells", 
"NK Cells", "NK Cells", "NK Cells", "NK Cells", "NK Cells", "NK Cells", 
"NK Cells", "NK Cells", "NK Cells", "NK Cells", "NK Cells", "NK Cells", 
"NK Cells", "NK Cells", "NK Cells", "NK Cells", "NK Cells", "NK Cells", 
"NK Cells", "Progenitor Tcells", "Progenitor Tcells", "Progenitor Tcells", 
"Progenitor Tcells", "Progenitor Tcells", "Progenitor Tcells", 
"Progenitor Tcells", "Progenitor Tcells", "Progenitor Tcells", 
"Progenitor Tcells", "Progenitor Tcells", "Progenitor Tcells", 
"Progenitor Tcells", "Progenitor Tcells", "Progenitor Tcells", 
"Progenitor Tcells", "Progenitor Tcells", "Progenitor Tcells", 
"Progenitor Tcells", "Progenitor Tcells", "Progenitor Tcells", 
"Progenitor Tcells", "Progenitor Tcells", "Progenitor Tcells", 
"Progenitor Tcells", "Progenitor Tcells", "Progenitor Tcells"
), condition = c("B16_NTX_1", "B16_NTX_2", "B16_NTX_3", "B16_NTX_4", 
"KPC_NTX_1", "KPC_NTX_2", "KPC_NTX_3", "KPC_NTX_4", "KPC_NTX_5", 
"B16_aPD1_1", "B16_aPD1_2", "B16_aPD1_3", "B16_aPD1_4", "KPC_aPD1_1", 
"KPC_aPD1_2", "KPC_aPD1_4", "KPC_aPD1_5", "B16_AC484_1", "B16_AC484_2", 
"B16_AC484_3", "B16_AC484_4", "KPC_AC484_1", "KPC_AC484_2", "KPC_AC484_3", 
"KPC_AC484_4", "KPC_AC484_5", "KPC_AC484_6", "B16_NTX_1", "B16_NTX_2", 
"B16_NTX_3", "B16_NTX_4", "KPC_NTX_1", "KPC_NTX_2", "KPC_NTX_3", 
"KPC_NTX_4", "KPC_NTX_5", "B16_aPD1_1", "B16_aPD1_2", "B16_aPD1_3", 
"B16_aPD1_4", "KPC_aPD1_1", "KPC_aPD1_2", "KPC_aPD1_4", "KPC_aPD1_5", 
"B16_AC484_1", "B16_AC484_2", "B16_AC484_3", "B16_AC484_4", "KPC_AC484_1", 
"KPC_AC484_2", "KPC_AC484_3", "KPC_AC484_4", "KPC_AC484_5", "KPC_AC484_6", 
"B16_NTX_1", "B16_NTX_2", "B16_NTX_3", "B16_NTX_4", "KPC_NTX_1", 
"KPC_NTX_2", "KPC_NTX_3", "KPC_NTX_4", "KPC_NTX_5", "B16_aPD1_1", 
"B16_aPD1_2", "B16_aPD1_3", "B16_aPD1_4", "KPC_aPD1_1", "KPC_aPD1_2", 
"KPC_aPD1_4", "KPC_aPD1_5", "B16_AC484_1", "B16_AC484_2", "B16_AC484_3", 
"B16_AC484_4", "KPC_AC484_1", "KPC_AC484_2", "KPC_AC484_3", "KPC_AC484_4", 
"KPC_AC484_5", "KPC_AC484_6", "B16_NTX_1", "B16_NTX_2", "B16_NTX_3", 
"B16_NTX_4", "KPC_NTX_1", "KPC_NTX_2", "KPC_NTX_3", "KPC_NTX_4", 
"KPC_NTX_5", "B16_aPD1_1", "B16_aPD1_2", "B16_aPD1_3", "B16_aPD1_4", 
"KPC_aPD1_1", "KPC_aPD1_2", "KPC_aPD1_4", "KPC_aPD1_5", "B16_AC484_1", 
"B16_AC484_2", "B16_AC484_3", "B16_AC484_4", "KPC_AC484_1", "KPC_AC484_2", 
"KPC_AC484_3", "KPC_AC484_4", "KPC_AC484_5", "KPC_AC484_6"), 
    `total cells` = c(1.705589084, 1.414617716, 2.009911894, 
    0.857632933, 1.263689975, 1.606186794, 2.006018054, 1.449275362, 
    1.286863271, 4.41495778, 5.954825462, 4.0544723, 4.275011018, 
    1.063829787, 2.304330552, 0.997592019, 1.957585644, 0.386100386, 
    0.500294291, 0.855880729, 0.876460768, 1.538118591, 1.736208345, 
    1.583467525, 1.176075269, 2.027027027, 1.645692159, 14.51062713, 
    4.479622769, 12.61013216, 7.261292167, 3.959561921, 12.55205235, 
    6.218655968, 14.29512516, 3.699731903, 4.680337756, 3.661875428, 
    7.644692046, 6.63287792, 1.968085106, 5.522447358, 10.49191606, 
    3.996737357, 8.273579702, 11.65391407, 12.64494754, 6.385642738, 
    20.17387428, 20.61047326, 22.49060655, 17.37231183, 13.73873874, 
    15.32752501, 3.332458672, 6.635230717, 2.450440529, 3.687821612, 
    4.085930918, 5.770374777, 3.911735206, 5.928853755, 2.252010724, 
    6.899879373, 7.289527721, 4.178272981, 6.170118995, 6.64893617, 
    3.535955503, 6.329549364, 3.752039152, 6.784335356, 5.974102413, 
    7.482054114, 7.429048414, 10.90057958, 9.633155979, 11.54052603, 
    10.68548387, 14.99356499, 16.16650532, 5.510364734, 2.72819131, 
    3.716960352, 5.431675243, 4.886267902, 6.543723974, 6.519558676, 
    5.401844532, 23.21715818, 3.908323281, 3.867214237, 2.69266481, 
    2.732481269, 1.861702128, 7.151370679, 7.258341933, 4.159869494, 
    3.281853282, 3.943496174, 4.389839867, 2.796327212, 4.257690593, 
    5.516661999, 4.911433172, 6.686827957, 8.719433719, 6.808647951
    ), New_Condition = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), levels = c("NTX", 
    "Anti-PD-1", "AC484"), class = "factor"), Tissue_Expression = c("Lymphoid", 
    "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", 
    "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", 
    "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", 
    "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", 
    "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", 
    "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", 
    "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", 
    "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", 
    "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", 
    "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", 
    "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", 
    "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", 
    "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", 
    "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", 
    "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", 
    "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", 
    "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", 
    "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", 
    "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", 
    "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", 
    "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", "Lymphoid", 
    "Lymphoid", "Lymphoid")), row.names = c(NA, -108L), class = c("tbl_df", 
"tbl", "data.frame"))

> str(filtered_lymph)
tibble [108 × 5] (S3: tbl_df/tbl/data.frame)
 $ cluster          : chr [1:108] "CD4+ Tcells" "CD4+ Tcells" "CD4+ Tcells" "CD4+ Tcells" ...
 $ condition        : chr [1:108] "B16_NTX_1" "B16_NTX_2" "B16_NTX_3" "B16_NTX_4" ...
 $ total cells      : num [1:108] 1.706 1.415 2.01 0.858 1.264 ...
 $ New_Condition    : Factor w/ 3 levels "NTX","Anti-PD-1",..: 1 1 1 1 1 1 1 1 1 2 ...
 $ Tissue_Expression: chr [1:108] "Lymphoid" "Lymphoid" "Lymphoid" "Lymphoid" ...

> summary(filtered_lymph)
   cluster           condition          total cells     
 Length:108         Length:108         Min.   : 0.3861  
 Class :character   Class :character   1st Qu.: 2.6321  
 Mode  :character   Mode  :character   Median : 4.5800  
                                       Mean   : 6.0848  
                                       3rd Qu.: 7.2591  
                                       Max.   :23.2172  
   New_Condition Tissue_Expression 
 NTX      :36    Length:108        
 Anti-PD-1:32    Class :character  
 AC484    :40    Mode  :character 

I am trying to find a way of conducting a two tailed unpaired t-test between the control and test 'New_Condition' variables - this would be between 'NTX' and Anti-PD-1' and another between 'NTX' and 'AC484'. I want this to be calculated for each of the different cell clusters ie. 'CD4+ Tcells', 'Progenitor Tcells' etc.

I then want to be able to plot the p-values above the respective pairs of boxplots on my ggplot - the code for which is detailed below:

p_lymph <- ggplot(filtered_lymph, aes(x = cluster, y = `total cells`, fill = New_Condition, color = New_Condition)) +
  geom_boxplot(position = position_dodge(width = 0.8), size = 0.9,linetype="solid", outlier.shape=NA) +
  scale_fill_manual(values=fill_colors) + #These two lines I am changing the legend label from New_Condition (as it is named in the original data column) to Treatment as stated on the original paper
  scale_color_manual(name="Condition", values = outline_colors) +
  theme_minimal()+
  guides(fill="none",color="none")+ #removing the legend
  labs(y="Total cells (%)",
       x=NULL)+
  scale_x_discrete(labels=x_lym_labels)+
  theme(
    axis.line=element_line(color="black",linewidth=0.5,linetype="solid"), #adding axis lines
    axis.text.x=element_blank(),
    axis.ticks.x=element_line(colour="black",linewidth=0.5),#manually adding in tick breaks
    axis.ticks.y=element_line(colour="black", linewidth=0.5),
    panel.grid=element_blank() #removing grid lines
  )+
      scale_y_continuous(breaks=seq(0,25, by=5),
                         limits=c(0,25), #Adjusting the y-axis breaks
                         labels=c("0","5","10","15","20","25") #customising the y-axis labels
                        )

I have previously tried the t.test function:

t.test(filtered_lymph$'total cells',filtered_lymph$cluster[filtered_lymph$New_Condition])$p.value

however got this error

Error in if (stderr < 10 * .Machine$double.eps * max(abs(mx), abs(my))) stop("data are essentially constant") : 
  missing value where TRUE/FALSE needed
In addition: Warning messages:
1: In mean.default(y) : argument is not numeric or logical: returning NA
2: In var(y) : NAs introduced by coercion

Any help would be greatly appreciated!


Solution

  • The answer, as hinted in the comments is to use something like geom_signif from ggsignif, but you would have to change the structure of your plot to get this to work:

    library(ggplot2)
    library(ggsignif)
    
    ggplot(filtered_lymph, aes(New_Condition, `total cells`,
                               color = New_Condition, fill = after_scale(alpha(color, 0.5)))) +
      geom_boxplot() +
      geom_signif(comparisons = list(c('NTX', 'Anti-PD-1'),
                                     c('NTX', 'AC484')),
                  step_increase = 0.1, color = 'black',
                  map_signif_level = scales::pvalue_format(add_p = TRUE)) +
      scale_x_discrete('Cluster', expand = c(0.25, 0.5)) +
      scale_color_manual("Condition", values = c("#4d4b4c", "#4c517b", "#c63a41")) +
      facet_grid(.~cluster, switch = 'x') +
      theme_minimal() +
      theme(panel.spacing.x = unit(0, 'mm'),
            axis.text.x = element_blank(),
            axis.line = element_line(),
            strip.placement = 'outside') 
    

    enter image description here