I'm a beginner on R Studio and writing my thesis, I would like to run a wilcoxon test for 5 variables dispatched in 2 groups, a full sample and a control sample, and to avoid creating 5 different dataframes I want to run it in one command.
The dataframe is a follow:
structure(list(Sample = c("Full", "Full", "Full", "Full", "Full",
"Full", "Full", "Full", "Full", "Full", "Full", "Full", "Full",
"Full", "Full", "Full", "Full", "Full", "Full", "Full", "Control",
"Control", "Control", "Control", "Control", "Control", "Control",
"Control", "Control", "Control", "Control", "Control", "Control",
"Control", "Control", "Control", "Control", "Control", "Control",
"Control"), EBITDA = c(7027, 9521, 926, 7085, 739, 3055, 1084,
7051, 90.2, 1863, 3567, 1514, 15212, 921.9, 15222, 336, 128.52,
21.93, 118.09, 1019.4, 4732.8, 17776, 940.45, 5265, 659.3, 4590.1,
1014.9, 3198, 72.51, 2083, 4277, 433.29, 9653, 628, 7706, 108,
118.07, 98.19, 52.39, 1138), SALES = c(30638, 25636, 6100, 25392,
10321, 24114, 10573, 44152, 700.68, 11588, 40682, 39434, 71203,
1233.9, 45503, 3553, 1198.01, 120.1, 349.62, 4460.4, 22532, 42384,
5295.86, 43800, 10704.4, 30427.8, 9051.2, 35555, 857.08, 14125,
60613, 3302.25, 66639, 1583, 28878, 3967, 1216.73, 819.28, 327.41,
5065), Net.Income = c(5648, 2363, 29, 2603, 9, 748, 580, 2983,
26.84, 324, 2669, 99, 2269, 214.1, 3810, -675, 78.17, 11.56,
30.25, 530.5, 2767.5, 5768, 401.55, 1378, 143, 1461, 265.2, 1311,
18.46, 856, 885, 281.22, -561, 313, 2198, -446, 55.6, 34.5, 29.15,
593), Total.Assets = c(53362, 1181372, 8000, 56666, 16175, 30690,
11459, 75357, 1071.75, 18649, 68198, 72137, 281640, 7279, 94276,
6482, 776.36, 121.9, 1198.02, 6776.6, 32063, 1965283, 9194.81,
43395, 7871.5, 49544.5, 9038.8, 37694, 643.75, 18135, 68975,
8427.42, 158499, 10323, 58993, 3539, 1031.06, 717.73, 568.18,
7577), ROA = c(0.105843109, 0.002000217, 0.003625, 0.045935835,
0.000556414, 0.02437276, 0.050615237, 0.039584909, 0.025043107,
0.017373586, 0.039136045, 0.001372389, 0.008056384, 0.029413381,
0.040413255, -0.104134526, 0.10068486, 0.094845124, 0.025249098,
0.078284095, 0.086314443, 0.002934946, 0.043671054, 0.03175481,
0.018166804, 0.029488642, 0.029340178, 0.034780071, 0.028681942,
0.047201544, 0.012830736, 0.03337013, -0.003539455, 0.030320643,
0.037258658, -0.126024301, 0.053925296, 0.048068282, 0.051307594,
0.078263165)), class = "data.frame", row.names = c(NA, -40L))
I tried the solution proposed here with the wilcox_test
function by rstatix but it only gives one p-value as the output and I would like the test score and the p-values for each of the variables.
Any help will be highly appreciated,
Thanks in advance !
Here is an example from my similar question (Apply statistical test to many variables: improve speed) tweaked to suit your use-case:
df <- structure(list(Sample = c("ISO 14001", "ISO 14001", "ISO 14001",
"ISO 14001", "ISO 14001", "ISO 14001", "ISO 14001", "ISO 14001",
"ISO 14001", "ISO 14001", "ISO 14001", "ISO 14001", "ISO 14001",
"ISO 14001", "ISO 14001", "ISO 14001", "ISO 14001", "ISO 14001",
"ISO 14001", "ISO 14001", "Control Sample", "Control Sample",
"Control Sample", "Control Sample", "Control Sample", "Control Sample",
"Control Sample", "Control Sample", "Control Sample", "Control Sample",
"Control Sample", "Control Sample", "Control Sample", "Control Sample",
"Control Sample", "Control Sample", "Control Sample", "Control Sample",
"Control Sample", "Control Sample"),
EBITDA = c(7027, 9521, 926,
7085, 739, 3055, 1084, 7051, 90.2, 1863, 3567, 1514, 15212, 921.9,
15222, 336, 128.52, 21.93, 118.09, 1019.4, 4732.8, 17776, 940.45,
5265, 659.3, 4590.1, 1014.9, 3198, 72.51, 2083, 4277, 433.29,
9653, 628, 7706, 108, 118.07, 98.19, 52.39, 1138),
SALES = c(30638,
25636, 6100, 25392, 10321, 24114, 10573, 44152, 700.68, 11588,
40682, 39434, 71203, 1233.9, 45503, 3553, 1198.01, 120.1, 349.62,
4460.4, 22532, 42384, 5295.86, 43800, 10704.4, 30427.8, 9051.2,
35555, 857.08, 14125, 60613, 3302.25, 66639, 1583, 28878, 3967,
1216.73, 819.28, 327.41, 5065),
Net.Income = c(5648, 2363, 29,
2603, 9, 748, 580, 2983, 26.84, 324, 2669, 99, 2269, 214.1, 3810,
-675, 78.17, 11.56, 30.25, 530.5, 2767.5, 5768, 401.55, 1378,
143, 1461, 265.2, 1311, 18.46, 856, 885, 281.22, -561, 313, 2198,
-446, 55.6, 34.5, 29.15, 593),
Total.Assets = c(53362, 1181372,
8000, 56666, 16175, 30690, 11459, 75357, 1071.75, 18649, 68198,
72137, 281640, 7279, 94276, 6482, 776.36, 121.9, 1198.02, 6776.6,
32063, 1965283, 9194.81, 43395, 7871.5, 49544.5, 9038.8, 37694,
643.75, 18135, 68975, 8427.42, 158499, 10323, 58993, 3539, 1031.06,
717.73, 568.18, 7577),
ROA = c(0.105843109, 0.002000217, 0.003625,
0.045935835, 0.000556414, 0.02437276, 0.050615237, 0.039584909,
0.025043107, 0.017373586, 0.039136045, 0.001372389, 0.008056384,
0.029413381, 0.040413255, -0.104134526, 0.10068486, 0.094845124,
0.025249098, 0.078284095, 0.086314443, 0.002934946, 0.043671054,
0.03175481, 0.018166804, 0.029488642, 0.029340178, 0.034780071,
0.028681942, 0.047201544, 0.012830736, 0.03337013, -0.003539455,
0.030320643, 0.037258658, -0.126024301, 0.053925296, 0.048068282,
0.051307594, 0.078263165)),
class = "data.frame", row.names = c(NA, -40L)
)
Map_func_pvalues <- function(dataset) {
tmp <- split(dataset, dataset$Sample)
stack(Map(function(x, y) wilcox.test(x, y, exact = FALSE)$p.value, tmp[[1]][-1], tmp[[2]][-1]))
}
p_values <- Map_func_pvalues(df)
Map_func_statistic <- function(dataset) {
tmp <- split(dataset, dataset$Sample)
stack(Map(function(x, y) wilcox.test(x, y, exact = FALSE)$statistic, tmp[[1]][-1], tmp[[2]][-1]))
}
test_scores <- Map_func_statistic(df)
merge(p_values, test_scores, by = "ind", suffix = c("_pvalue", "_wilcox_statistic"))
#> ind values_pvalue values_wilcox_statistic
#> 1 EBITDA 0.6948910 185
#> 2 Net.Income 0.8817308 194
#> 3 ROA 0.7352682 213
#> 4 SALES 0.9245727 196
#> 5 Total.Assets 0.5978626 180
Created on 2022-11-15 by the reprex package (v2.0.1)
Does that solve your problem?