r ggplot2 boxplot outliers violin-plot

Ignore outliers in box-violin plot in ggplot2


I'm trying to draw a box-violin plot in ggplot2, but I can't find a way to make geom_violin ignore outliers, which geom_boxplot handles with outlier.shape = NA. As a result, the tails of the violins extend all the way to the top of the y-axis.

Here's my data:

> dput(data)
structure(list(Group = c("A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "B", "B", "B", 
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", 
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", 
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", 
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", 
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", 
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", 
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B"), Type = c("1", 
"1", "1", "1", "1", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "1", "1", "1", "1", "1", "1", "1", "1", "1", 
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", 
"1", "1", "1", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2"), Value = c(1245.2261, 2886.96, 3572.6615, 
2011.1111, 3321.2025, 229.5533, 14.1449, 135.291, 54.4526, 36.0926, 
74.5434, 86.335, 131.4279, 105.4935, 14.5906, 1.503, 2.7716, 
42.381, 88.9701, 869.6742, 316.855, 32.9683, 6.4267, 52.2946, 
164.4073, 54.2387, 37.5134, 71.2792, 145.958, 114.6187, 36.133, 
1.8108, 67.9746, 39.8386, 382.5043, 40.1728, 37.1252, 288.6866, 
25.085, 21.8553, 15.0067, 143.8127, 16.8865, 26.8421, 8.8349, 
188.1872, 42.2323, 64.2163, 56.7453, 85.4888, 29.6905, 6.1148, 
43.0328, 158.0811, 90.4613, 217.033, 111.5344, 271.5655, 195.7022, 
79.7093, 6.0458, 116.6274, 43.6644, 72.4189, 89.9063, 37.6572, 
294.5133, 46.8855, 16.7959, 50.4155, 39.6882, 18.7457, 12.728, 
40.2756, 129.6219, 190.0905, 796.7611, 30.1724, 14.8736, 551.2666, 
18.2315, 57.9076, 129.7094, 158.1109, 256.6553, 79.6724, 75.2056, 
7.2661, 18.7643, 79.4748, 445.5713, 9.9553, 106.6388, 50.0596, 
56.4002, 157.1143, 9.805, 117.2691, 8.9047, 3.6258, 387.132, 
56.8996, 40.7247, 1117.4439, 79.4742, 224.0688, 134.8485, 8.4794, 
23.1996, 65.2439, 389.3144, 294.4159, 671.4736, 541.8969, 64.3243, 
25.0634, 7.727, 20.8132, 149.3634, 160.7447, 114.1869, 38.4615, 
28.502, 34.0532, 15.0038, 1028.626, 166.3813, 24.7788, 306.6516, 
204.0348, 18.1818, 77.4041, 24.1017, 96.4706, 59.4937, 23.2078, 
3.192, 37.8065, 40.8055, 8.3577, 7.4273, 66.426, 1548.8338, 3.6242, 
92.264, 42.8195, 282.1101, 104.0848, 42.5784, 9.9258, 63.8066, 
99.6852, 26.5864, 270.322, 121.4097, 32.6258, 287.2582, 7.4627, 
851.5289, 156.0563, 324.1189, 101.5936, 5.618, 114.3788, 54.6875, 
96.5594, 446.1059, 95.1883, 30.3678, 48.2655, 61.4182, 66.5381, 
4.0973, 8.1744, 2.7192, 0.3697, 0.3681, 0.7488, 0, 7.9272, 1.1391, 
1.4375, 0.7535, 0.8256, 1.0323, 0.9053, 2.7822, 0.6899, 3.037, 
2.423, 0.7045, 6.1298, 1.7498, 10.5565, 0.684, 2.1433, 1.5334, 
1.7043, 1.3783, 0.6146, 8.9179, 1.3879, 4.2004, 2.0747, 0.3508, 
4.4362, 0.7214, 1.2232, 4.1245, 17.8295, 240.18, 61.0013, 0.813, 
69.9786, 0.4346, 1.624, 30.4569, 4.4143, 5.3119, 0.4459, 0, 1.1484, 
3.7614, 2.863, 1452.5581, 3.7736, 1.7705, 10.6081, 2743.5433, 
6.019, 0.4851, 2.4719, 7.5529, 0, 6.9739, 1.5783, 0, 1.3115, 
2.7701, 2.6135, 0, 0.9915, 4.0413, 2.3496, 1.796, 0.8745, 5.6391, 
0.2803, 3202.3684, 19.5453, 17.5439, 1.831, 4.1848, 0.7547, 0, 
0.5253, 0, 38.07, 0.6656, 5.6184, 0, 1.6858, 0.4801, 0.6676, 
3.0412)), row.names = c(NA, -265L), class = c("tbl_df", "tbl", 
"data.frame"))

and the code:

# Box-violin plot of Value per Group, split and filled by Type.
ggplot(data, aes(x = Group, y = Value, fill = Type)) +
  # Layers, in the order they are added: violins, narrow boxplots, p-value marks
  geom_violin(
    width = 0.5,
    scale = "width",
    color = "black",
    show.legend = FALSE
  ) +
  geom_boxplot(
    position = position_dodge(width = 0.5),
    width = 0.1,
    color = "black",
    lwd = 0.5,
    outlier.shape = NA, # do not draw the boxplot's outlier points
    show.legend = TRUE
  ) +
  stat_compare_means(
    aes(group = Type), # compare the two Types within each Group
    method = "wilcox.test",
    label = "p.signif",
    vjust = 0.5,
    size = 10
  ) +
  # Scales and axis titles
  scale_fill_manual(
    name = "Type",
    breaks = c("1", "2"),
    values = c("1" = "red", "2" = "forestgreen")
  ) +
  scale_y_continuous(breaks = pretty(data$Value, n = 8)) +
  labs(x = "Group", y = "Value") +
  # Theme: start from theme_bw(), then drop the grid and embolden the text
  theme_bw() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(color = "black"),
    panel.background = element_blank(),
    axis.ticks = element_line(color = "black"),
    axis.text.x = element_text(
      size = 12, color = "black", face = "bold", vjust = 0.5
    ),
    axis.text.y = element_text(size = 12, color = "black", face = "bold"),
    axis.title = element_text(size = 15, face = "bold"),
    legend.title = element_text(size = 15, face = "bold")
  ) +
  # Legend: override the size used for the legend keys
  guides(
    fill = guide_legend(title = "Type", override.aes = list(size = 10))
  )

I tried this but it doesn't work for me. I would like to have the tails of the violin restricted to the two ends (minimum/maximum values) of the boxplot.


Solution

  • This is an option. Use the original Value column for the boxplot & create a new column for the violin with all outliers set to NA. Remember you can always mutate your data to fit your use case!

    library(dplyr)
    library(ggplot2)
    library(ggpubr)
    data <- structure(...)
    
    # helper function
    # Replace boxplot-style outliers in a numeric vector with NA.
    #
    # A value counts as an outlier when it lies more than 1.5 * IQR below the
    # first quartile or above the third quartile of `x`.
    #
    # x: numeric vector; missing values are allowed and are left as NA.
    # Returns a vector of the same length as `x` with the outliers set to NA.
    replace_outliers <- function(x) {
        # na.rm = TRUE: quantile() stops with an error on missing values otherwise.
        # names = FALSE: drop the "25%"/"75%" labels so they cannot leak into the mask.
        quartiles <- quantile(x, c(0.25, 0.75), na.rm = TRUE, names = FALSE)
        fence <- 1.5 * diff(quartiles)
        # Guard with !is.na(x) so missing inputs (and all-NA input, where the
        # quartiles themselves are NA) never produce an NA in the index.
        is_outlier <- !is.na(x) &
            (x < quartiles[1] - fence | x > quartiles[2] + fence)
        x[is_outlier] <- NA
        x
    }
    
    # Box-violin plot in which the violins ignore outliers:
    #   * boxplots and the significance test use the original Value column
    #   * violins use Value_NA, a copy of Value with the outliers set to NA
    data %>%
        # Outliers are flagged within each Group x Type subset
        group_by(Group, Type) %>%
        mutate(Value_NA = replace_outliers(Value)) %>% # replace outliers with NA
        ungroup() %>% # grouping was only needed for the mutate above
        ggplot(aes(x = Group,
            fill = Type)) +
        geom_violin(aes(y = Value_NA),
            width = 0.5,
            scale = "width",
            color = "black",
            # The NAs are the outliers blanked on purpose, so drop them
            # silently instead of warning about removed rows on every render
            na.rm = TRUE,
            show.legend = FALSE) +
        geom_boxplot(aes(y = Value),
            position = position_dodge(width = 0.5),
            width = 0.1,
            color = "black",
            lwd = 0.5,
            outlier.shape = NA, # do not draw the boxplot's outlier points
            show.legend = TRUE) +
        scale_fill_manual(name = "Type",
            breaks = c("1", "2"),
            values = c("1" = "red",
                "2" = "forestgreen")) +
        # The test is run on the full data (Value), not on the trimmed column
        stat_compare_means(aes(y = Value, group = Type),
            method = "wilcox.test",
            label = "p.signif",
            vjust = 0.5,
            size = 10) +
        scale_y_continuous(breaks = pretty(data$Value, n = 8)) +
        xlab("Group") +
        ylab("Value") +
        theme_bw() +
        theme(panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            panel.border = element_rect(color = "black"),
            panel.background = element_blank(),
            axis.ticks = element_line(color = "black"),
            axis.text.x = element_text(size = 12,
                color = "black",
                face = "bold",
                vjust = 0.5),
            axis.text.y = element_text(size = 12,
                color = "black",
                face = "bold"),
            axis.title = element_text(size = 15,
                face = "bold"),
            legend.title = element_text(size = 15,
                face = "bold")) +
        guides(fill = guide_legend(title = "Type",
            override.aes = list(size = 10)))
    

    Created on 2024-05-14 with reprex v2.1.0