How to outline in one way for each variables selected a graph with normality distribution

I was trying to perform a normality distribution test on a series of specific variables of the dataset I'm working on and I've coded the followings commands

as.data.frame(d) %>% 
      dplyr::select(
        age, T1.rt, CR.rt) %>% 
      na.omit() %>% 
      apply(., 2, ad.test)

However, I do not know how to enter also a command for outline in one way for each variables I've selected the histograms with nomrality distribution with normality curve included. Specifically, the histogram code I'm instrested in should have the following hallmarks:

Taking the variable 'age'

windows(width=7, height=7); par(lwd=1, las=1, family="sans", cex=1, 
  mgp=c(3.0,1,0))
hist2(d$age, freq=F, main="", xlab="age", ylab="", col="darkgray")
curve(dnorm(x, mean=mean(d$age[!is.na(d$age)]), 
  sd=sd(d$age[!is.na(d$age)])), add=T)
skewness.kurtosis(d$age)
ks.test(d$age, "pnorm", mean=mean(d$age[!is.na(d$age)]), 
  sd=sd(d$age[!is.na(d$age)]))

I'm just reporting here some of the observation of the dataset I'm working on:

   df_demo <- tibble::tibble(
  participant = rep("PZ101", 50),
  sex = sample(c("M", "F", "X"), 50, replace = TRUE),
  years = sample(20:40, 50, replace = TRUE),
  attention_time = sample(55:75, 50, replace = TRUE),
  phase = sample(c("prep", "focus", "test"), 50, replace = TRUE),
  trial_key = sample(c("A", "B", "C", "None"), 50, replace = TRUE),
  resp1 = sample(c(0, 1), 50, replace = TRUE),
  cond = sample(c("YES", "NO", "MAYBE"), 50, replace = TRUE),
  trial_rt = round(runif(50, 0.5, 1.2), 3),
  choice_key = sample(c("q", "r", "space"), 50, replace = TRUE),
  choice_rt = round(runif(50, 0.4, 1.0), 3),
  trial_id = 1:50,
  dots_L = sample(45:55, 50, replace = TRUE),
  dots_R = sample(45:55, 50, replace = TRUE),
  cue = sample(c("up", "down", "left", "right"), 50, replace = TRUE),
  signal_quality = round(runif(50, 0.1, 0.9), 2),
  session_code = sample(LETTERS[1:4], 50, replace = TRUE),
  is_repeat = sample(c(TRUE, FALSE), 50, replace = TRUE),
  accuracy = sample(c("low", "medium", "high"), 50, replace = TRUE),
  category = sample(c("A", "B", "C", "D"), 50, replace = TRUE)
)

Has anyone any clue? How should I keep on the afprementioned code? Furthermore I would like that the graphs might plotted together in the same window, not overlapped.

Solution

Okay, now, having your data, we can do it again.

We load your data

df=structure(list(ID = c("P1323", "P1323", "P1323", "P1323", "P1323", 
 "P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323", 
 "P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323", 
 "P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323", 
 "P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323", 
 "P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323", 
 "P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323", 
 "P1323", "P1323", "P1323"), gender = c("F", "F", "F", "F", "F", 
 "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", 
 "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", 
 "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", 
 "F", "F", "F", "F", "F", "F"), age = c(19, 19, 19, 19, 19, 19, 
 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 
 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 
 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19), fixation_time = c(60, 
 60, 60, 60, 60, 70, 50, 50, 50, 70, 70, 60, 50, 60, 70, 70, 50, 
 70, 70, 60, 70, 50, 50, 50, 60, 70, 60, 50, 60, 70, 60, 70, 50, 
 60, 70, 50, 50, 70, 70, 70, 70, 50, 60, 50, 60, 60, 70, 50, 60, 
 60), block = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), t1.key = c("None", "None", 
 "None", "space", "None", "space", "None", "None", "None", "space", 
 "None", "None", "space", "None", "None", "space", "None", "None", 
 "space", "None", "space", "space", "space", "None", "None", "None", 
 "space", "space", "None", "None", "space", "None", "None", "None", 
 "None", "None", "None", "space", "space", "None", "None", "None", 
 "None", "space", "None", "None", "space", "None", "space", "None"
 ), T1.response = c(0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 
 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 
 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0), COND = c("NR", 
 "NR", "NR", "R", "NR", "R", "NR", "NR", "NR", "R", "NR", "NR", 
 "R", "NR", "NR", "R", "NR", "NR", "R", "NR", "R", "R", "R", "NR", 
 "NR", "NR", "R", "R", "NR", "NR", "R", "NR", "NR", "NR", "NR", 
 "NR", "NR", "R", "R", "NR", "NR", "NR", "NR", "R", "NR", "NR", 
 "R", "NR", "R", "NR"), T1.rt = c(NA, NA, NA, 0.812299799988978, 
 NA, 0.72336569998879, NA, NA, NA, 0.772733500052709, NA, NA, 
 0.606754800013732, NA, NA, 0.601030899968464, NA, NA, 0.838272600027267, 
 NA, 0.305548300035298, 0.849945599969942, 0.748269900039304, 
 NA, NA, NA, 0.859215400007088, 0.95704890001798, NA, NA, 0.874362500035204, 
 NA, NA, NA, NA, NA, NA, 0.270455699996091, 0.75726039998699, 
 NA, NA, NA, NA, 0.762694000033662, NA, NA, 0.789715700026136, 
 NA, 0.90579859999707, NA), CR.key = c("p", "p", "p", "p", "p", 
 "p", "p", "p", "p", "p", "p", "p", "p", "p", "o", "p", "i", "i", 
 "h", "u", "i", "u", "o", "o", "p", "p", "p", "o", "p", "i", "o", 
 "p", "p", "p", "o", "o", "o", "p", "i", "p", "p", "o", "o", "i", 
 "i", "o", "o", "i", "i", "u"), CR.rt = c(0.651771800010465, 0.585048799985088, 
 0.652350199990906, 0.69888829998672, 1.01917029998731, 0.550036200031173, 
 0.0361186999944039, 0.568817299965303, 0.452191599993966, 0.514980700041633, 
 0.619590600021184, 0.719264700019266, 0.466181399999186, 0.45217840000987, 
 0.668881699966732, 0.914478300022893, 1.01910460001091, 1.40315000002738, 
 1.69993370003067, 1.71914210001705, 1.29938790004235, 0.698139799991623, 
 0.848338100011461, 0.651829700043891, 0.486136299965438, 0.703567499993369, 
 0.76673849998042, 0.54929809999885, 0.718664799991529, 0.768383099988569, 
 0.898415500007104, 0.819344500021543, 0.61898209998617, 0.737225699995179, 
 1.03654629999073, 0.971092400024645, 1.4362695000018, 0.999490200018045, 
 0.932840399967972, 0.586312200000975, 0.786785800009966, 1.01987839996582, 
 0.93673920002766, 0.715710600023158, 0.819960499997251, 0.75370900001144, 
 0.818668299994897, 0.903600800025742, 1.1176545000053, 1.10352450003847
 ), trial_num = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 
 14, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 
 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 
 50, 51, 52, 53), ldots = c(48, 48, 52, 55, 51, 51, 52, 49, 45, 
 55, 49, 49, 51, 49, 48, 52, 45, 49, 45, 55, 51, 48, 55, 51, 45, 
 45, 52, 48, 48, 48, 55, 51, 49, 48, 49, 51, 51, 55, 51, 49, 45, 
 55, 51, 55, 55, 52, 52, 48, 49, 52), rdots = c(52, 52, 48, 45, 
 49, 49, 48, 51, 55, 45, 51, 51, 49, 51, 52, 48, 55, 51, 55, 45, 
 49, 52, 45, 49, 55, 55, 48, 52, 52, 52, 45, 49, 51, 52, 51, 49, 
 49, 45, 49, 51, 55, 45, 49, 45, 45, 48, 48, 52, 51, 48), TASK = c("left", 
 "left", "left", "left", "left", "left", "left", "left", "left", 
 "left", "left", "left", "left", "left", "left", "left", "left", 
 "left", "left", "left", "left", "left", "left", "left", "left", 
 "left", "left", "left", "left", "left", "left", "left", "left", 
 "left", "left", "left", "left", "left", "left", "left", "left", 
 "left", "left", "left", "left", "left", "left", "left", "left", 
 "left"), T1.correct = c(0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 
 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 
 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1), Go.Nogo..whether.a.person.should.respond. = c("NR", 
 "NR", "R", "R", "R", "R", "R", "NR", "NR", "R", "NR", "NR", "R", 
 "NR", "NR", "R", "NR", "NR", "NR", "R", "R", "NR", "R", "R", 
 "NR", "NR", "R", "NR", "NR", "NR", "R", "R", "NR", "NR", "NR", 
 "R", "R", "R", "R", "NR", "NR", "R", "R", "R", "R", "R", "R", 
 "NR", "NR", "R"), T1.ACC = c(1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 
 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 
 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0), CR = c(4, 
 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 2, 2, 9, 1, 2, 1, 
 3, 3, 4, 4, 4, 3, 4, 2, 3, 4, 4, 4, 3, 3, 3, 4, 2, 4, 4, 3, 3, 
 2, 2, 3, 3, 2, 2, 1), difficulty = c("medium", "medium", "medium", 
 "easy", "hard", "hard", "medium", "hard", "easy", "easy", "hard", 
 "hard", "hard", "hard", "medium", "medium", "easy", "hard", "easy", 
 "easy", "hard", "medium", "easy", "hard", "easy", "easy", "medium", 
 "medium", "medium", "medium", "easy", "hard", "hard", "medium", 
 "hard", "hard", "hard", "easy", "hard", "hard", "easy", "easy", 
 "hard", "easy", "easy", "medium", "medium", "medium", "hard", 
 "medium")), row.names = c(NA, -50L), class = c("tbl_df", "tbl", 
 "data.frame"))

Now we will change their representation so that the lines contain individual variable names and data (tibble in tibble)

library(tidyverse)
df1 = df %>% 
  pivot_longer(where(is.numeric), names_to = "variable", values_to = "value") %>% 
  group_by(variable) %>% 
  nest()

output df1

# A tibble: 12 x 2
# Groups:   variable [12]
   variable      data             
   <chr>         <list>           
 1 age           <tibble [50 x 9]>
 2 fixation_time <tibble [50 x 9]>
 3 block         <tibble [50 x 9]>
 4 T1.response   <tibble [50 x 9]>
 5 T1.rt         <tibble [50 x 9]>
 6 CR.rt         <tibble [50 x 9]>
 7 trial_num     <tibble [50 x 9]>
 8 ldots         <tibble [50 x 9]>
 9 rdots         <tibble [50 x 9]>
10 T1.correct    <tibble [50 x 9]>
11 T1.ACC        <tibble [50 x 9]>
12 CR            <tibble [50 x 9]>

What is such a tibble in tibble? Let's check.

df1$data[[6]] %>% select(ID, value)

output

# A tibble: 50 x 2
   ID     value
   <chr>  <dbl>
 1 P1323 0.652 
 2 P1323 0.585 
 3 P1323 0.652 
 4 P1323 0.699 
 5 P1323 1.02  
 6 P1323 0.550 
 7 P1323 0.0361
 8 P1323 0.569 
 9 P1323 0.452 
10 P1323 0.515 
# ... with 40 more rows

Well, since we have data in separate tibble, let's create a function that returns the statistics we are interested in.

add.statistic = function(data) {
  x = data$value[!is.na(data$value)] 
  suppressWarnings(tibble(
    n = length(x),
    nunique = length(unique(x)),
    mean = mean(x),
    sd = sd(x),
    skew = ifelse(nunique>1,moments::skewness(x), NA),
    kur = ifelse(nunique>1,moments::kurtosis(x), NA),
    ks.p = ifelse(nunique>1,stats::ks.test(x, "pnorm", mean, sd)$p.value, NA),
    ad.stat = ifelse(nunique>1,nortest::ad.test(x)$statistic, NA),
    ad.p = ifelse(nunique>1,nortest::ad.test(x)$p.value, NA),
    sw.stat = ifelse(nunique>1,stats::shapiro.test(x)$statistic, NA),
    sw.p = ifelse(nunique>1,stats::shapiro.test(x)$p.value, NA)
  ))
}

A few comments are needed here. Variables can contain all the same values. Then counting statistics like skewness, kurtosis, shapiro and the like is pointless. For this I added the ifelse function and the nunique variable there.

Furthermore, the test ks.test may generate warnings. For this I used suppressWarnings to silence it.

Now let's see how our add.statistic function works

add.statistic(df1$data[[6]])

output

# A tibble: 1 x 11
      n nunique  mean    sd  skew   kur  ks.p ad.stat    ad.p sw.stat    sw.p
  <int>   <int> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1    50      50 0.818 0.313 0.849  4.53 0.456    1.17 0.00421   0.928 0.00465

Bingo! We have statistics! Let's put it together now.

df1 = df %>% 
  pivot_longer(where(is.numeric), names_to = "variable", values_to = "value") %>% 
  group_by(variable) %>% 
  nest() %>% 
  mutate(stat = map(data, add.statistic)) 

df1 %>%  unnest(stat)

output

# A tibble: 12 x 13
# Groups:   variable [12]
   variable      data                  n nunique   mean     sd    skew   kur          ks.p ad.stat      ad.p sw.stat      sw.p
   <chr>         <list>            <int>   <int>  <dbl>  <dbl>   <dbl> <dbl>         <dbl>   <dbl>     <dbl>   <dbl>     <dbl>
 1 age           <tibble [50 x 9]>    50       1 19      0     NA      NA    NA             NA     NA         NA     NA       
 2 fixation_time <tibble [50 x 9]>    50       3 60.4    8.07  -0.0719  1.57  0.0139         3.90   7.17e-10   0.799  8.36e- 7
 3 block         <tibble [50 x 9]>    50       1  0      0     NA      NA    NA             NA     NA         NA     NA       
 4 T1.response   <tibble [50 x 9]>    50       2  0.34   0.479  0.676   1.46  0.0000000391  10.1    3.7 e-24   0.599  1.85e-10
 5 T1.rt         <tibble [50 x 9]>    17      17  0.731  0.191 -1.42    4.15  0.209          1.20   2.74e- 3   0.819  3.78e- 3
 6 CR.rt         <tibble [50 x 9]>    50      50  0.818  0.313  0.849   4.53  0.456          1.17   4.21e- 3   0.928  4.65e- 3
 7 trial_num     <tibble [50 x 9]>    50      50 26.5   16.0   -0.0204  1.78  0.854          0.582  1.23e- 1   0.952  4.28e- 2
 8 ldots         <tibble [50 x 9]>    50       6 50.2    3.05   0.0230  2.28  0.297          1.27   2.35e- 3   0.918  2.06e- 3
 9 rdots         <tibble [50 x 9]>    50       6 49.8    3.05  -0.0230  2.28  0.297          1.27   2.35e- 3   0.918  2.06e- 3
10 T1.correct    <tibble [50 x 9]>    50       2  0.52   0.505 -0.0801  1.01  0.0000101      8.84   8.98e-22   0.636  6.99e-10
11 T1.ACC        <tibble [50 x 9]>    50       2  0.66   0.479 -0.676   1.46  0.0000000391  10.1    3.7 e-24   0.599  1.85e-10
12 CR            <tibble [50 x 9]>    50       5  3.32   1.25   1.39    9.82  0.00112        3.66   2.77e- 9   0.755  9.19e- 8

Everything works great. So let's create a function that generates the graph. However, only for variables for which nunique> 1!

create.plot = function(df, group){
  stat = df$stat[[1]]
  data = df$data[[1]]
  hist.val = hist(data$value, 30)
  if(stat$nunique ==1) return(NULL)
  statlab = stat %>%
    pivot_longer(everything(), names_to = "stat", values_to = "val") %>%
    mutate(x=hist.val$breaks[2],
           y=seq(max(hist.val$density)*0.1,
                 max(hist.val$density)*0.7,
                 length.out=length(stat)),
           lab = paste(stat,":",signif(val,3)))
  data %>% ggplot(aes(value))+
    geom_histogram(aes(y=..density..), colour="black", fill="white")+
    geom_density(alpha=.2, fill="red", col="red")+
    stat_function(fun = dnorm, args = list(mean = stat$mean, sd = stat$sd),
                  col="blue")+
    geom_label(data=statlab, aes(x=x, y=y, label = lab))+
    xlab(group)
}

Let's check how our create.plot function works on the selected variable

create.plot(df1[6,], df1[6,1])

I don't know if you exactly expected it, but I assume you might like such a plot.

Now all we have to do is combine everything together in one neat command

df1 %>% group_by(variable) %>% 
  group_map(create.plot)

Below are some selected charts