rggplot2plotggboxplot

Something odd is happening with my boxplot. The pvalue line dispayed from stat_pvalue_manual() with ggboxplot() is running off the graph


Something odd is happening with my box plot. The pvalue line dispayed from stat_pvalue_manual() with ggboxplot() is running off the graph.

I created the box plot below from the same code but different data and it displays correctly

box plot displaying correctly

But when I run the same code with a different dataset, the significance value line runs off the graphs.

Here is my code:




library(tidyverse)
library(ggpubr)
library(rstatix)


#source data

t1c2_.5_all<-structure(list(Type = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L), levels = c("T1", "C1", "C2", "C3"), class = "factor"), Year = c(2004, 
2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 
2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2004, 2005, 2006, 
2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 
2018, 2019, 2020, 2021, 2022, 2023), buff_dist = c(0.5, 0.5, 
0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 
0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 
0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), 
    NDVI_Avg3 = c(NA, 6395.874774, 6567.268516, 6625.956351, 
    6396.39154166667, 6329.32849466667, 6327.09833166667, 6431.656085, 
    6376.82626833333, 6399.35978466667, 6457.27028166667, 6530.29056866667, 
    6504.67240266667, 6665.66358833333, 6590.31482566667, 6680.14639133333, 
    6509.14329666667, 6504.518513, 6448.14020766667, NA, NA, 
    4509.88554533333, 4563.733277, 4755.027259, 4346.68286633333, 
    4471.81173166667, 4241.25849033333, 4387.160754, 4318.91640866667, 
    4412.97042333333, 4442.207563, 4346.34542466667, 4386.22608033333, 
    4727.431841, 4709.10905166667, 4936.07561766667, 4694.84953633333, 
    4616.632718, 4293.052728, NA)), row.names = c(NA, -40L), class = c("tbl_df", 
"tbl", "data.frame"))

#pairwise testing 
t1c2_.5_all.pw <- t1c2_.5_all %>% pairwise_t_test(NDVI_Avg3 ~ Type, p.adjust.method = "bonferroni")
t1c2_.5_all.pw


#add the pvalues to the graph by plotting its position on the x axis 
t1c2_.5_all.pw <- t1c2_.5_all.pw %>% add_xy_position(x = "Type")
t1c2_.5_all.pw
####the x="" must be the same as the x="" in the box plot to work 

#the box plot
t1c2_.5_all.box <- ggboxplot(t1c2_.5_all, x = "Type", y = "NDVI_Avg3") +
  ggtitle(label = "PWP Watersheds (all.5yr) - All Years ")+
  stat_pvalue_manual(t1c2_.5_all.pw , label = "p.adj.signif", tip.length = 0, step.increase = 0.1) +
  labs(
    subtitle = get_test_label(t1c2_.5_all.pw, detailed = TRUE),
    caption = get_pwc_label(t1c2_.5_all.pw)
  ) 
#label = "p.adj" would provide the values
#hide.ns = TRUE would hide values that are not significant

t1c2_.5_all.box
ggsave("t1c2_.5km_allyears_boxplot.png", width=6, height=5)

But when I run this, I nearly have what I want to display. boxplot not displaying correctly

Is there a parameter I need to include in the stat_pvalue_manual() section of the ggboxplot()?

I have used this code in the past to also compare multiple box plots and it worked like a charm! Just struggling with the comparison of some pairs of boxplots

enter image description here

Thank you in advanced for your help!


Solution

  • The problem is caused by the input data having extra levels for Type; if you remove the extra levels (i.e. 'refactor' Type using mutate(Type = factor(Type))) the bar is as expected, e.g.

    library(tidyverse)
    library(ggpubr)
    library(rstatix)
    #> 
    #> Attaching package: 'rstatix'
    #> The following object is masked from 'package:stats':
    #> 
    #>     filter
    
    t1c2_.5_all<-structure(list(Type = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
                                                   1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 
                                                   3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
                                                   3L), levels = c("T1", "C1", "C2", "C3"), class = "factor"), Year = c(2004, 
                                                                                                                        2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 
                                                                                                                        2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2004, 2005, 2006, 
                                                                                                                        2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 
                                                                                                                        2018, 2019, 2020, 2021, 2022, 2023), buff_dist = c(0.5, 0.5, 
                                                                                                                                                                           0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 
                                                                                                                                                                           0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 
                                                                                                                                                                           0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), 
                                NDVI_Avg3 = c(NA, 6395.874774, 6567.268516, 6625.956351, 
                                              6396.39154166667, 6329.32849466667, 6327.09833166667, 6431.656085, 
                                              6376.82626833333, 6399.35978466667, 6457.27028166667, 6530.29056866667, 
                                              6504.67240266667, 6665.66358833333, 6590.31482566667, 6680.14639133333, 
                                              6509.14329666667, 6504.518513, 6448.14020766667, NA, NA, 
                                              4509.88554533333, 4563.733277, 4755.027259, 4346.68286633333, 
                                              4471.81173166667, 4241.25849033333, 4387.160754, 4318.91640866667, 
                                              4412.97042333333, 4442.207563, 4346.34542466667, 4386.22608033333, 
                                              4727.431841, 4709.10905166667, 4936.07561766667, 4694.84953633333, 
                                              4616.632718, 4293.052728, NA)), row.names = c(NA, -40L), class = c("tbl_df", 
                                                                                                                 "tbl", "data.frame"))
    
    # Type has two different values, but four levels
    unique(t1c2_.5_all$Type)
    #> [1] T1 C2
    #> Levels: T1 C1 C2 C3
    
    # Fix Type then conduct pairwise testing 
    t1c2_.5_all.pw <- t1c2_.5_all %>%
      mutate(Type = factor(Type)) %>%
      pairwise_t_test(NDVI_Avg3 ~ Type, p.adjust.method = "bonferroni")
    t1c2_.5_all.pw
    #> # A tibble: 1 × 9
    #>   .y.       group1 group2    n1    n2        p p.signif    p.adj p.adj.signif
    #> * <chr>     <chr>  <chr>  <int> <int>    <dbl> <chr>       <dbl> <chr>       
    #> 1 NDVI_Avg3 T1     C2        20    20 2.14e-29 ****     2.14e-29 ****
    
    #add the pvalues to the graph by plotting its position on the x axis 
    t1c2_.5_all.pw <- t1c2_.5_all.pw %>% add_xy_position(x = "Type")
    
    #the box plot
    t1c2_.5_all.box <- ggboxplot(t1c2_.5_all, x = "Type", y = "NDVI_Avg3") +
      ggtitle(label = "PWP Watersheds (all.5yr) - All Years ")+
      stat_pvalue_manual(t1c2_.5_all.pw, label = "p.adj.signif",
                         tip.length = 0, step.increase = 0.1) +
      labs(
        subtitle = get_test_label(t1c2_.5_all.pw, detailed = TRUE),
        caption = get_pwc_label(t1c2_.5_all.pw)
      )
    #label = "p.adj" would provide the values
    #hide.ns = TRUE would hide values that are not significant
    
    t1c2_.5_all.box
    #> Warning: Removed 4 rows containing non-finite outside the scale range
    #> (`stat_boxplot()`).
    

    ggsave("t1c2_.5km_allyears_boxplot.png", width=6, height=5)
    

    Created on 2024-12-13 with reprex v2.1.0