rggpubrhypothesis-test

hypothesis test to compare the means in groups


I have the following dataset:

totalamount <- structure(list(severity = c(3.25018430323761, 8.21019552826707, 10.2345631283889, 3.97861473573608, 2.49151725256104, 5.52900361815363, 55.2071093946671, 2.32037933918783, 3.70223446009639, 2.54604033976289, 6.11158768002453, 6.77828584824538, 2.95234684891201, 3.42869781905917, 1.42585469193667, 16.0747484046982, 34.6790705177556, 6.35006508638326, 21.3835553934895, 8.05182596903091, 5.51058496865505, 6.62574677139218, 18.5431647736137, 8.42472199298749, 3.36375810388373, 1.97372841937075, 8.31896832645395,  4.65269327269977, 6.22991152661561, 15.8544718751783, 1.90660480387344, 5.08052016589429, 20.2338335298909, 8.50945042127277, 1.51414101802446, 26.2729903771137, 5.53077768848019, 12.8038148444239, 3.12234414248935, 12.6786863038133, 26.2319370867102, 2.61240179627255, 8.23388110738683, 14.3826203214327, 11.595535364264, 97.6518578506433, 3.88736013960141, 40.0522894434289, 13.0286722381151, 13.8311206454621, 2.32138683407293, 16.2987548824861, 6.48548721511325, 18.3424463342478, 2.13908320122601,  20.6094599194298, 15.0090654547867, 20.8399356559801, 14.820113077459, 7.7047427230498, 4.39456178522343, 10.000704907942, 3.83333333333335, 11.4318867413853, 7.89010985916763, 21.1525423728813, 6.68434449076974, 30.7879775228089, 3.50698443368264, 1.57333784525088, 3.42869781905917, 6.61297948804958, 3.62752331453914, 1.97691919770155, 6.00848130742765, 60.5423027993827, 3.89628569263605, 3.49144306748366, 24.3750905239606, 11.595535364264, 7.70054828336092, 18.6975666270077, 2.09556927634091, 15.3288687440042,  7.24716796005598, 8.71416860168375, 8.80379237078557, 3.23719672489693, 2.6867931761598, 4.56424081767021, 2.21797125308475, 10.2054909549981, 3.2998707122004, 3.63083883463682, 5.4154303099926, 0.989943258036202, 26.0323383549274, 24.0264313596043, 1.27079897767722, 6.9055733274643, 6.01734276046511, 4.52444479739508, 5.18185818729858, 3.51960556662747, 5.63007253187119, 1.15126025566876, 18.0777099933802, 17.8171568613498, 6.30910836452448, 10.1390199646202, 5.46720789634529, 20.2819827606193,  3.62938031125651, 1.33160475087464, 8.75280601928286, 9.10988864573485, 13.1488401777766, 13.7673431302572, 21.0409114970367), prod = c(507.755867714392, 1705.49880158972, 3640.34113040821, 858.239175113401, 238.706492667227, 1244.6940148014, 3977.72626741407, 339.042388721234, 923.458086400708, 419.370659017813, 1158.70861783272, 1179.59095902181, 295.599071534232, 398.278208742789, 194.798500739898, 4650.15567576958, 6329.11241677931, 1028.0570456891, 4925.91370333093, 2022.3511913243, 783.15963920042,  716.208920204282, 2381.76958822984, 1254.23069689481, 778.9043298375, 161.157156502681, 1459.13900370654, 940.233384921433, 1198.83896632231, 3565.26547361682, 395.837386951839, 889.871919575969, 6996.9192914905, 2001.8432222017, 144.079015010842, 4547.94609766363, 1041.17426542942, 1515.78182421578, 448.256010275335, 3227.88109312385, 7761.31628488419, 250.126433743004, 983.315745749984, 3694.51237283626, 2512.9776843891, 7310.96143332019, 790.498601521711, 3996.2587837501, 1651.70598778014, 2804.27004030374,  307.274181420005, 2133.64570615031, 1381.62822639232, 3751.4044378629, 485.552835444503, 2465.6874345525, 3025.91221835444, 5158.1742720822, 2574.98910483359, 1273.35710229232, 598.036156054948, 1643.50834356176, 319.378521699767, 2181.69261318497, 1837.09807672159, 1995.27654536416, 1095.2915743086, 2019.78842929518, 493.82769157334, 154.231798231093, 398.278208742789, 1306.25665858632, 393.003321145881, 244.412035268987, 1215.07197775974, 5575.22264019978, 81.8997236216959, 804.215278351321, 3572.29089339365,  2512.9776843891, 1187.64667079267, 3881.32939729115, 281.095890630284, 1976.75638215586, 1972.34564060146, 2108.31162543769, 2531.23741386524, 727.63049151556, 421.413414189908, 885.516102680844, 299.807327188995, 1762.96442932355, 752.831017112768, 793.224382445508, 390.715569543075, 112.899610829257, 6219.34258924352, 3237.6574058879, 76.4770620527288, 1223.88524456515, 1344.24723795186, 790.08886120287, 753.308224886514, 495.744119100497, 1405.79935366988, 229.928716828958, 2151.99773055987, 1569.44091576891,  1030.48482889462, 1980.07816897896, 1133.82514266977, 2639.74185437294, 425.245731957402, 168.746508943713, 1094.91478503367, 1041.10131771391, 1505.42904059794, 952.396081074004, 1863.20884404754), lesion_level = structure(c(3L, 3L, 2L, 4L, 4L, 2L, 2L, 2L, 2L, 3L, 3L, 4L, 4L, 4L, 3L, 1L, 1L, 1L, 1L, 3L, 3L, 2L, 4L, 4L, 2L, 3L, 4L, 3L, 3L, 1L, 2L, 4L, 1L, 4L, 4L, 2L, 3L, 4L, 3L, 1L, 1L, 3L, 3L, 1L, 1L, 4L, 2L, 2L, 3L, 2L, 3L, 4L, 4L, 1L, 2L, 3L, 1L, 1L, 2L, 4L, 4L, 4L, 4L, 1L, 2L, 4L, 1L, 3L, 2L,  4L, 4L, 3L, 1L, 3L, 2L, 4L, 4L, 2L, 1L, 1L, 1L, 1L, 4L, 4L, 2L, 1L, 2L, 2L, 3L, 2L, 3L, 4L, 2L, 2L, 3L, 2L, 1L, 3L, 4L, 2L, 1L, 3L, 3L, 4L, 2L, 2L, 4L, 2L, 3L, 1L, 1L, 4L, 4L, 3L, 4L, 3L, 4L, 2L, 4L), levels = c("C1-C4", "C5-C8", "Paraplegia", "ASIA D"), class = "factor"), sex = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L,  1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L), levels = c("Male", "Female"), class = "factor"), SCIM = c(22, 2, 0, 8, 50, 17, 8, 64, 8, 17, 16, 13, 70, 62, 34, 12, 10, 29, 0, 3, 26, 6, 20, 18, 13, 70, 19, 27, 15, 11, 28, 15, 0, 6, 51, 15, 18, 14, 48, 11, 3, 14, 13, 18, 2, 10, 15, 8,  16, 9, 45, 19, 4, 8, 22, 22, 19, 15, 10, 23, 54, 14, 55, 21, 8, 70, 33, 21, 59, 49, 62, 27, 58, 40, 19, 0, 95, 16, 11, 2, 9, 7, 33, 22, 18, 17, 11, 15, 24, 27, 34, 8, 14, 12, 16, 21, 8, 14, 89, 42, 17, 15, 26, 29, 9, 14, 18, 4, 14, 9, 22, 12, 32, 46, 48, 35, 30, 27, 40)), row.names = c("1", "2", "3", "4", "5", "6", "7", "9", "10", "11", "12", "13", "14", "15", "17", "18", "19", "21", "29", "32", "35", "36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", "51", "52",  "53", "54", "55", "57", "58", "60", "61", "62", "66", "68", "70", "71", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", "98", "99", "100", "101", "102", "103", "104", "105", "106", "107", "108", "109", "110", "111", "112", "113", "114", "115", "116", "117", "118", "119", "120", "121", "122", "123", "124", "125", "126", "127", "128", "129", "130", "131", "132", "133", "134", "135", "136", "137", "138", "139",  "140", "141", "142", "143", "144"), class = "data.frame")

I have generated the following plot:

The plot was generated using the following code:

ggplot(totalamount, aes(x = lesion_level, y = severity, color = sex, fill = sex)) +
  geom_boxplot(alpha = 0.3) +
  geom_smooth(aes(group = sex), alpha = 0.05) +
  scale_color_manual(values = c("Male"=colores[1], "Female"=colores[4])) +
  scale_fill_manual(values = c("Male"=colores[1], "Female"=colores[4])) +
  guides(fill = guide_legend(title = "Gender"), color = guide_legend(title = "Gender")) +
  ylab(label = "name_serv") +
  xlab(label = "") +
  theme(axis.title.y = element_text(size = 8))

enter image description here

But, I would like to compare the means of every group, for example in the two bars with C1-C4 to include a p value = 0.2 (for example) because it seems that the means of the two groups are similar. I would like to include a p value for each group, I mean C5-C8, Paraplegia and ASIA D.

I found the library ggpubr really healpfull for this task. But I can not get the desired output.

For example to create something similar the code is the following:

ggboxplot(totalamount, x="lesion_level", y="severity",
          color="sex", palette="jco", add="jitter", bxp.errorbar = TRUE) +
  stat_compare_means()  

enter image description here

But in this case I am not clear what is the comparison considered in this p-value which group is being compared? how to include 4 p values to analyze the comparison between each group?


Solution

  • kruskal.test only does a simple one-deep comparison. If you're trying to calculate the p-value between sexes for each of your lesion_level, then I suggest we calculate that first.

    I'll demo in dplyr, though mostly for convenience, other summarize-by-group options exist.

    library(dplyr)
    res <- group_by(totalamount, lesion_level) %>%
      summarize(pv = kruskal.test(severity ~ sex, data = cur_data())$p.value) %>%
      mutate(
        lbl = formatC(signif(pv, digits=3), digits=3, format="fg", flag="#"),
        lbl = paste0(if_else(lesion_level == "C1-C4", "Kruskal-Wallis\np = ", "\np = "),
                     lbl)
      )
    res
    # # A tibble: 4 × 3
    #   lesion_level    pv lbl                        
    #   <fct>        <dbl> <chr>                      
    # 1 C1-C4        0.867 "Kruskal-Wallis\np = 0.867"
    # 2 C5-C8        0.487 "\np = 0.487"              
    # 3 Paraplegia   0.721 "\np = 0.721"              
    # 4 ASIA D       0.320 "\np = 0.320"              
    

    I used the formatC(signif(..), ..) to produce a simple 3-significant-digit display robust across different orders of magnitude (borrowed from https://stackoverflow.com/a/3443955/3358272). While it does little here, if you had one group where the p-value was near 0.000237 then you could see the difference:

    formatC(signif(c(res$pv, 0.000273), digits=3), digits=3, format="fg", flag="#")
    # [1] "0.867"    "0.487"    "0.721"    "0.320"    "0.000273"
    

    This allows us to plot it as:

    gg <- ggplot(totalamount, aes(x = lesion_level, y = severity, color = sex, fill = sex)) +
      geom_boxplot(alpha = 0.3) +
      geom_smooth(aes(group = sex), alpha = 0.05) +
      geom_text(aes(x = lesion_level, label = lbl), y = Inf, vjust = 2,
                data = res, inherit.aes = FALSE) +
      # scale_color_manual(values = c("Male"=colores[1], "Female"=colores[4])) +
      # scale_fill_manual(values = c("Male"=colores[1], "Female"=colores[4])) +
      guides(fill = guide_legend(title = "Gender"), color = guide_legend(title = "Gender")) +
      ylab(label = "name_serv") +
      xlab(label = "") +
      theme(axis.title.y = element_text(size = 8))
    suppressWarnings(print(gg))
    

    (I commented out the scale_* expressions since we don't have colores.)

    ggplot with Kruskal-Wallis p-values over each pair of boxplots

    The aesthetics of my additions can easily be improved, but I think the premise should be clear.