r function loops functional-programming kruskal-wallis

Guidelines for specifying variables in a function

Consider dat1 created here:

set.seed(123)
dat1 <- data.frame(Region = rep(c("r1","r2"), each = 100),
                   State = rep(c("NY","MA","FL","GA"), each = 10),
                   Loc = rep(c("a","b","c","d","e","f","g","h"),each = 5),
                   ID = rep(c(1:10), each = 2),
                   var1 = rnorm(200),
                   var2 = rnorm(200),
                   var3 = rnorm(200),
                   var4 = rnorm(200),
                   var5 = rnorm(200))

dat1 has measurements for 5 variables, and observations (IDs) can be grouped according to 3 grouping variables: Loc, State, and Region I am having to perform various tasks on each response variable/grouping variable combination, so I have been writing functions to make it easier, and keep my analysis tidy. I am using the rstatix package to do several operations. The following function will conduct a Kruskal Wallis test on the data I specify, calculate the effect size efsz and return the results in a single data frame res:

library(rstatix)
KruskTest <- function(dat, groupvar, var){
  kt <- dat%>%kruskal_test(get(var) ~ get(groupvar))
  efsz <- dat%>%kruskal_effsize(get(var) ~ get(groupvar))
  res <<- cbind(kt, efsz[,3:5])
  res[1,1] <<- var
  res$groupvar <<- groupvar 
  res <<- res[,c(10,1:9)]
}
KruskTest(dat=dat1, groupvar = "Region", var = "var1")

Now I can use that function to loop over each response variable and get the results for a grouping variable (example shows it for Region) in a single data frame, which is what I need:

vars <- paste(names(dat1[,5:9]))
a <- data.frame()
for(i in vars){
  KruskTest(dat=dat1, groupvar="Region", var= i)
  a <- rbind(a, res)
}

That works great for the Kruskal Wallis test, now I want to make a very similar function that will do a duns test, but watch what happens:

dunn <- function(dat, groupvar, var){
  res <<- dat%>%rstatix::dunn_test(get(var) ~ get(groupvar), p.adjust.method = "bonferroni")
}
dunn(dat=dat1, groupvar="Region", var = "var1")

r:Error: Can't extract columns that don't exist. x The column `get(groupvar)` doesn't exist.

Outside of a user-written function, you specify data for the dunn_test() and kruskal_test() the exact same way. So what is the difference between specifying variables in these two funcitons, and why does the first one work but not the second?

Solution

Taking into account @Gregor 's comments about not writing to the environment and trying to clean up some other rough edges's I have a proposed improvement although Gregor is correct your biggest problem was nothing but a typo.

library(rstatix)
library(purrr)

# rewritten to avoid writing to environment

NewKruskTest <- function(dat, groupvar, var) {
  kt <- dat %>% kruskal_test(as.formula(paste(var, "~", groupvar)))
  efsz <- dat %>% kruskal_effsize(as.formula(paste(var, "~", groupvar)))
  results <- cbind(kt, efsz[,3:5])
  results$groupvar <- groupvar 
  results <- results[,c(10,1:9)]
  return(results)
}

# works on a single if you want to test
# NewKruskTest(dat = dat1, groupvar = "Region", var = "var1") 

# No paste needed
vars <- names(dat1[,5:9])

# NewKruskTest will work in your existing for loop but you 
# may find `purrr:map_dfr` cleaner

map_dfr(vars, ~ NewKruskTest(dat = dat1, groupvar = "Region", var = .))
#>   groupvar  .y.   n statistic df      p         method      effsize method.1
#> 1   Region var1 200 3.0520896  1 0.0806 Kruskal-Wallis  0.010364089  eta2[H]
#> 2   Region var2 200 0.5961552  1 0.4400 Kruskal-Wallis -0.002039620  eta2[H]
#> 3   Region var3 200 1.6330090  1 0.2010 Kruskal-Wallis  0.003197015  eta2[H]
#> 4   Region var4 200 3.4031343  1 0.0651 Kruskal-Wallis  0.012137042  eta2[H]
#> 5   Region var5 200 0.7230090  1 0.3950 Kruskal-Wallis -0.001398945  eta2[H]
#>   magnitude
#> 1     small
#> 2     small
#> 3     small
#> 4     small
#> 5     small

# NewDunn rewritten

NewDunn <- function(dat, groupvar, var) {
  results <- dat %>% rstatix::dunn_test(as.formula(paste(var, "~", groupvar)), 
                        p.adjust.method = "bonferroni")
  results$groupvar <- groupvar 
  results <- results[,c(10,1:9)]
  return(results)
}

# works on a single if you want to test
# NewDunn(dat=dat1, groupvar ="Region", var = "var1")

map_dfr(vars, ~ NewDunn(dat = dat1, groupvar = "Region", var = .))
#> # A tibble: 5 x 10
#>   groupvar .y.   group1 group2    n1    n2 statistic      p  p.adj p.adj.signif
#>   <chr>    <chr> <chr>  <chr>  <int> <int>     <dbl>  <dbl>  <dbl> <chr>       
#> 1 Region   var1  r1     r2       100   100    -1.75  0.0806 0.0806 ns          
#> 2 Region   var2  r1     r2       100   100    -0.772 0.440  0.440  ns          
#> 3 Region   var3  r1     r2       100   100    -1.28  0.201  0.201  ns          
#> 4 Region   var4  r1     r2       100   100     1.84  0.0651 0.0651 ns          
#> 5 Region   var5  r1     r2       100   100    -0.850 0.395  0.395  ns

based on your data


set.seed(123)
dat1 <- data.frame(Region = rep(c("r1","r2"), each = 100),
                   State = rep(c("NY","MA","FL","GA"), each = 10),
                   Loc = rep(c("a","b","c","d","e","f","g","h"),each = 5),
                   ID = rep(c(1:10), each = 2),
                   var1 = rnorm(200),
                   var2 = rnorm(200),
                   var3 = rnorm(200),
                   var4 = rnorm(200),
                   var5 = rnorm(200))