I was trying to perform a normality distribution test on a series of specific variables of the dataset I'm working on and I've coded the followings commands
as.data.frame(df_demo) %>%
dplyr::select(years, trial_rt, choice_rt) %>%
tidyr::drop_na() %>%
apply(., 2, nortest::ad.test)
However, I do not know how to enter also a command for outline in one way for each variables I've selected the histograms with nomrality distribution with normality curve included. Specifically, the histogram code I'm instrested in should have the following hallmarks:
Taking the variable 'age'
windows(width = 7, height = 7)
par(lwd = 1, las = 1, family = "sans", cex = 1, mgp = c(3.0, 1, 0))
hist2(df_demo$years, freq = FALSE, main = "",
xlab = "years", ylab = "", col = "darkgray")
curve(dnorm(x,
mean = mean(df_demo$years, na.rm = TRUE),
sd = sd(df_demo$years, na.rm = TRUE)),
add = TRUE)
psych::skewness.kurtosis(df_demo$years)
ks.test(df_demo$years, "pnorm",
mean = mean(df_demo$years, na.rm = TRUE),
sd = sd(df_demo$years, na.rm = TRUE))
I'm just reporting here some of the observation of the dataset I'm working on:
head(df_demo, 50)
# A tibble: 50 × 20
participant sex years attention_time phase trial_key resp1 cond trial_rt choice_key choice_rt trial_id dots_L dots_R cue
<chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <int> <int> <chr>
1 PX001 F 22 60 left None 0 NR NA p 0.652 0 48 52 left
2 PX001 F 22 60 left None 0 NR NA p 0.585 1 48 52 left
3 PX001 F 22 60 left None 0 R NA p 0.652 2 52 48 left
4 PX001 F 22 60 left space 1 R 0.812 p 0.699 3 55 45 left
...
Has anyone any clue? How should I keep on the afprementioned code? Furthermore I would like that the graphs might plotted together in the same window, not overlapped.
Okay, now, having your data, we can do it again.
df=structure(list(ID = c("P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323"), gender = c("F", "F", "F", "F", "F",
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F",
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F",
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F",
"F", "F", "F", "F", "F", "F"), age = c(19, 19, 19, 19, 19, 19,
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19), fixation_time = c(60,
60, 60, 60, 60, 70, 50, 50, 50, 70, 70, 60, 50, 60, 70, 70, 50,
70, 70, 60, 70, 50, 50, 50, 60, 70, 60, 50, 60, 70, 60, 70, 50,
60, 70, 50, 50, 70, 70, 70, 70, 50, 60, 50, 60, 60, 70, 50, 60,
60), block = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), t1.key = c("None", "None",
"None", "space", "None", "space", "None", "None", "None", "space",
"None", "None", "space", "None", "None", "space", "None", "None",
"space", "None", "space", "space", "space", "None", "None", "None",
"space", "space", "None", "None", "space", "None", "None", "None",
"None", "None", "None", "space", "space", "None", "None", "None",
"None", "space", "None", "None", "space", "None", "space", "None"
), T1.response = c(0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0), COND = c("NR",
"NR", "NR", "R", "NR", "R", "NR", "NR", "NR", "R", "NR", "NR",
"R", "NR", "NR", "R", "NR", "NR", "R", "NR", "R", "R", "R", "NR",
"NR", "NR", "R", "R", "NR", "NR", "R", "NR", "NR", "NR", "NR",
"NR", "NR", "R", "R", "NR", "NR", "NR", "NR", "R", "NR", "NR",
"R", "NR", "R", "NR"), T1.rt = c(NA, NA, NA, 0.812299799988978,
NA, 0.72336569998879, NA, NA, NA, 0.772733500052709, NA, NA,
0.606754800013732, NA, NA, 0.601030899968464, NA, NA, 0.838272600027267,
NA, 0.305548300035298, 0.849945599969942, 0.748269900039304,
NA, NA, NA, 0.859215400007088, 0.95704890001798, NA, NA, 0.874362500035204,
NA, NA, NA, NA, NA, NA, 0.270455699996091, 0.75726039998699,
NA, NA, NA, NA, 0.762694000033662, NA, NA, 0.789715700026136,
NA, 0.90579859999707, NA), CR.key = c("p", "p", "p", "p", "p",
"p", "p", "p", "p", "p", "p", "p", "p", "p", "o", "p", "i", "i",
"h", "u", "i", "u", "o", "o", "p", "p", "p", "o", "p", "i", "o",
"p", "p", "p", "o", "o", "o", "p", "i", "p", "p", "o", "o", "i",
"i", "o", "o", "i", "i", "u"), CR.rt = c(0.651771800010465, 0.585048799985088,
0.652350199990906, 0.69888829998672, 1.01917029998731, 0.550036200031173,
0.0361186999944039, 0.568817299965303, 0.452191599993966, 0.514980700041633,
0.619590600021184, 0.719264700019266, 0.466181399999186, 0.45217840000987,
0.668881699966732, 0.914478300022893, 1.01910460001091, 1.40315000002738,
1.69993370003067, 1.71914210001705, 1.29938790004235, 0.698139799991623,
0.848338100011461, 0.651829700043891, 0.486136299965438, 0.703567499993369,
0.76673849998042, 0.54929809999885, 0.718664799991529, 0.768383099988569,
0.898415500007104, 0.819344500021543, 0.61898209998617, 0.737225699995179,
1.03654629999073, 0.971092400024645, 1.4362695000018, 0.999490200018045,
0.932840399967972, 0.586312200000975, 0.786785800009966, 1.01987839996582,
0.93673920002766, 0.715710600023158, 0.819960499997251, 0.75370900001144,
0.818668299994897, 0.903600800025742, 1.1176545000053, 1.10352450003847
), trial_num = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32,
33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49,
50, 51, 52, 53), ldots = c(48, 48, 52, 55, 51, 51, 52, 49, 45,
55, 49, 49, 51, 49, 48, 52, 45, 49, 45, 55, 51, 48, 55, 51, 45,
45, 52, 48, 48, 48, 55, 51, 49, 48, 49, 51, 51, 55, 51, 49, 45,
55, 51, 55, 55, 52, 52, 48, 49, 52), rdots = c(52, 52, 48, 45,
49, 49, 48, 51, 55, 45, 51, 51, 49, 51, 52, 48, 55, 51, 55, 45,
49, 52, 45, 49, 55, 55, 48, 52, 52, 52, 45, 49, 51, 52, 51, 49,
49, 45, 49, 51, 55, 45, 49, 45, 45, 48, 48, 52, 51, 48), TASK = c("left",
"left", "left", "left", "left", "left", "left", "left", "left",
"left", "left", "left", "left", "left", "left", "left", "left",
"left", "left", "left", "left", "left", "left", "left", "left",
"left", "left", "left", "left", "left", "left", "left", "left",
"left", "left", "left", "left", "left", "left", "left", "left",
"left", "left", "left", "left", "left", "left", "left", "left",
"left"), T1.correct = c(0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1), Go.Nogo..whether.a.person.should.respond. = c("NR",
"NR", "R", "R", "R", "R", "R", "NR", "NR", "R", "NR", "NR", "R",
"NR", "NR", "R", "NR", "NR", "NR", "R", "R", "NR", "R", "R",
"NR", "NR", "R", "NR", "NR", "NR", "R", "R", "NR", "NR", "NR",
"R", "R", "R", "R", "NR", "NR", "R", "R", "R", "R", "R", "R",
"NR", "NR", "R"), T1.ACC = c(1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0), CR = c(4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 2, 2, 9, 1, 2, 1,
3, 3, 4, 4, 4, 3, 4, 2, 3, 4, 4, 4, 3, 3, 3, 4, 2, 4, 4, 3, 3,
2, 2, 3, 3, 2, 2, 1), difficulty = c("medium", "medium", "medium",
"easy", "hard", "hard", "medium", "hard", "easy", "easy", "hard",
"hard", "hard", "hard", "medium", "medium", "easy", "hard", "easy",
"easy", "hard", "medium", "easy", "hard", "easy", "easy", "medium",
"medium", "medium", "medium", "easy", "hard", "hard", "medium",
"hard", "hard", "hard", "easy", "hard", "hard", "easy", "easy",
"hard", "easy", "easy", "medium", "medium", "medium", "hard",
"medium")), row.names = c(NA, -50L), class = c("tbl_df", "tbl",
"data.frame"))
tibble
in tibble
)library(tidyverse)
df1 = df %>%
pivot_longer(where(is.numeric), names_to = "variable", values_to = "value") %>%
group_by(variable) %>%
nest()
output df1
# A tibble: 12 x 2
# Groups: variable [12]
variable data
<chr> <list>
1 age <tibble [50 x 9]>
2 fixation_time <tibble [50 x 9]>
3 block <tibble [50 x 9]>
4 T1.response <tibble [50 x 9]>
5 T1.rt <tibble [50 x 9]>
6 CR.rt <tibble [50 x 9]>
7 trial_num <tibble [50 x 9]>
8 ldots <tibble [50 x 9]>
9 rdots <tibble [50 x 9]>
10 T1.correct <tibble [50 x 9]>
11 T1.ACC <tibble [50 x 9]>
12 CR <tibble [50 x 9]>
What is such a tibble
in tibble
? Let's check.
df1$data[[6]] %>% select(ID, value)
output
# A tibble: 50 x 2
ID value
<chr> <dbl>
1 P1323 0.652
2 P1323 0.585
3 P1323 0.652
4 P1323 0.699
5 P1323 1.02
6 P1323 0.550
7 P1323 0.0361
8 P1323 0.569
9 P1323 0.452
10 P1323 0.515
# ... with 40 more rows
Well, since we have data in separate tibble
, let's create a function that returns the statistics we are interested in.
add.statistic = function(data) {
x = data$value[!is.na(data$value)]
suppressWarnings(tibble(
n = length(x),
nunique = length(unique(x)),
mean = mean(x),
sd = sd(x),
skew = ifelse(nunique>1,moments::skewness(x), NA),
kur = ifelse(nunique>1,moments::kurtosis(x), NA),
ks.p = ifelse(nunique>1,stats::ks.test(x, "pnorm", mean, sd)$p.value, NA),
ad.stat = ifelse(nunique>1,nortest::ad.test(x)$statistic, NA),
ad.p = ifelse(nunique>1,nortest::ad.test(x)$p.value, NA),
sw.stat = ifelse(nunique>1,stats::shapiro.test(x)$statistic, NA),
sw.p = ifelse(nunique>1,stats::shapiro.test(x)$p.value, NA)
))
}
A few comments are needed here. Variables can contain all the same values. Then counting statistics like skewness
, kurtosis
, shapiro
and the like is pointless. For this I added the ifelse
function and the nunique
variable there.
Furthermore, the test ks.test
may generate warnings. For this I used suppressWarnings
to silence it.
Now let's see how our add.statistic
function works
add.statistic(df1$data[[6]])
output
# A tibble: 1 x 11
n nunique mean sd skew kur ks.p ad.stat ad.p sw.stat sw.p
<int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 50 50 0.818 0.313 0.849 4.53 0.456 1.17 0.00421 0.928 0.00465
Bingo! We have statistics! Let's put it together now.
df1 = df %>%
pivot_longer(where(is.numeric), names_to = "variable", values_to = "value") %>%
group_by(variable) %>%
nest() %>%
mutate(stat = map(data, add.statistic))
df1 %>% unnest(stat)
output
# A tibble: 12 x 13
# Groups: variable [12]
variable data n nunique mean sd skew kur ks.p ad.stat ad.p sw.stat sw.p
<chr> <list> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 age <tibble [50 x 9]> 50 1 19 0 NA NA NA NA NA NA NA
2 fixation_time <tibble [50 x 9]> 50 3 60.4 8.07 -0.0719 1.57 0.0139 3.90 7.17e-10 0.799 8.36e- 7
3 block <tibble [50 x 9]> 50 1 0 0 NA NA NA NA NA NA NA
4 T1.response <tibble [50 x 9]> 50 2 0.34 0.479 0.676 1.46 0.0000000391 10.1 3.7 e-24 0.599 1.85e-10
5 T1.rt <tibble [50 x 9]> 17 17 0.731 0.191 -1.42 4.15 0.209 1.20 2.74e- 3 0.819 3.78e- 3
6 CR.rt <tibble [50 x 9]> 50 50 0.818 0.313 0.849 4.53 0.456 1.17 4.21e- 3 0.928 4.65e- 3
7 trial_num <tibble [50 x 9]> 50 50 26.5 16.0 -0.0204 1.78 0.854 0.582 1.23e- 1 0.952 4.28e- 2
8 ldots <tibble [50 x 9]> 50 6 50.2 3.05 0.0230 2.28 0.297 1.27 2.35e- 3 0.918 2.06e- 3
9 rdots <tibble [50 x 9]> 50 6 49.8 3.05 -0.0230 2.28 0.297 1.27 2.35e- 3 0.918 2.06e- 3
10 T1.correct <tibble [50 x 9]> 50 2 0.52 0.505 -0.0801 1.01 0.0000101 8.84 8.98e-22 0.636 6.99e-10
11 T1.ACC <tibble [50 x 9]> 50 2 0.66 0.479 -0.676 1.46 0.0000000391 10.1 3.7 e-24 0.599 1.85e-10
12 CR <tibble [50 x 9]> 50 5 3.32 1.25 1.39 9.82 0.00112 3.66 2.77e- 9 0.755 9.19e- 8
Everything works great. So let's create a function that generates the graph. However, only for variables for which nunique
> 1!
create.plot = function(df, group){
stat = df$stat[[1]]
data = df$data[[1]]
hist.val = hist(data$value, 30)
if(stat$nunique ==1) return(NULL)
statlab = stat %>%
pivot_longer(everything(), names_to = "stat", values_to = "val") %>%
mutate(x=hist.val$breaks[2],
y=seq(max(hist.val$density)*0.1,
max(hist.val$density)*0.7,
length.out=length(stat)),
lab = paste(stat,":",signif(val,3)))
data %>% ggplot(aes(value))+
geom_histogram(aes(y=..density..), colour="black", fill="white")+
geom_density(alpha=.2, fill="red", col="red")+
stat_function(fun = dnorm, args = list(mean = stat$mean, sd = stat$sd),
col="blue")+
geom_label(data=statlab, aes(x=x, y=y, label = lab))+
xlab(group)
}
Let's check how our create.plot
function works on the selected variable
create.plot(df1[6,], df1[6,1])
I don't know if you exactly expected it, but I assume you might like such a plot.
Now all we have to do is combine everything together in one neat command
df1 %>% group_by(variable) %>%
group_map(create.plot)