rdplyrbroom

broom::tidy and dplyr::group_modify in R: Loping functions across variables grouped by species


I am trying to get a data frame of t.test outputs grouped by species for many variables. I have a subset of data that looks like this;

structure(list(Species = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), levels = c("Apetahia longistigmata", 
"Clermontia fauriei", "Cyanea fissa", "Cyanea hardyi", "Lobelia gregoriana", 
"Lobelia hypoleuca", "Lobelia lukwangulensis", "Trematolobelia kauaiensis"
), class = "factor"), Experiment = structure(c(1L, 1L, 1L, 2L, 
2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L), levels = c("standard 1", 
"standard 2"), class = "factor"), Alpha.peak.1b = c(-78.35, -78.34, 
-78.84, -78.85, -78.01, -78.17, -79.17, -79.01, -79.01, -79.18, 
-79.18, -79.72, -77.85, -77.86, -77.69, -78.01, -77.86), Alpha.recrystalization.peak = c(-70.34, 
-78.34, -70.83, -70.67, -70.67, -71.17, -71, -71.17, -70.67, 
-70.84, -71.01, -70.67, -70.67, -70.68, -70.84, -70.67, -70.68
), Beta.peak.1a = c(-39.37, -39.63, -39.85, -39.53, -39.69, -39.85, 
-40.52, -40.52, -40.69, -40.86, -40.69, -40.68, -38.87, -38.56, 
-38.72, -38.86, -38.39)), row.names = c(NA, -17L), class = c("tbl_df", 
"tbl", "data.frame"))

I am trying the following;

df.ttest<-df %>%
  group_by(Species) %>%
  group_modify(~ broom::tidy(df[,3:ncol(df)], 2, function(x) t.test(x ~ Experiment, data = df))) 

On my subset of data i get the following error;

Error: unexpected ',' in "ction(x) t.test(x ~ Experiment, data = df))) 

However, on my full data, I get an output that looks like this;

structure(list(Species = structure(c(1L, 1L, 1L), levels = c("Apetahia longistigmata", 
"Clermontia fauriei", "Cyanea fissa", "Cyanea hardyi", "Lobelia gregoriana", 
"Lobelia hypoleuca", "Lobelia lukwangulensis", "Trematolobelia kauaiensis"
), class = "factor"), column = c("Alpha.peak.1b", "Alpha.recrystalization.peak", 
"Beta.peak.1a"), n = c(Alpha.peak.1b = 47, Alpha.recrystalization.peak = 47, 
Beta.peak.1a = 47), mean = c(Alpha.peak.1b = -78.5808510638298, 
Alpha.recrystalization.peak = -71.0140425531915, Beta.peak.1a = -39.4
), sd = c(Alpha.peak.1b = 0.577779970860107, Alpha.recrystalization.peak = 1.18348905612851, 
Beta.peak.1a = 0.869930032169005), median = c(Alpha.peak.1b = -78.68, 
Alpha.recrystalization.peak = -70.84, Beta.peak.1a = -39.52), 
    trimmed = c(Alpha.peak.1b = -78.5807692307692, Alpha.recrystalization.peak = -70.8828205128205, 
    Beta.peak.1a = -39.4182051282051), mad = c(Alpha.peak.1b = 0.490000000000009, 
    Alpha.recrystalization.peak = 0.170000000000002, Beta.peak.1a = 0.669999999999995
    ), min = c(Alpha.peak.1b = -79.72, Alpha.recrystalization.peak = -78.34, 
    Beta.peak.1a = -40.86), max = c(Alpha.peak.1b = -77.52, Alpha.recrystalization.peak = -69.51, 
    Beta.peak.1a = -37.69), range = c(Alpha.peak.1b = 2.2, Alpha.recrystalization.peak = 8.83, 
    Beta.peak.1a = 3.17), skew = c(Alpha.peak.1b = 0.027222651634338, 
    Alpha.recrystalization.peak = -5.13007598432232, Beta.peak.1a = 0.112391518548281
    ), kurtosis = c(Alpha.peak.1b = 2.00457929376749, Alpha.recrystalization.peak = 32.7138551153747, 
    Beta.peak.1a = 1.9708376743257), se = c(Alpha.peak.1b = 0.0842778705371632, 
    Alpha.recrystalization.peak = 0.172629621110036, Beta.peak.1a = 0.126892336746095
    )), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -3L), groups = structure(list(Species = structure(1L, levels = c("Apetahia longistigmata", 
"Clermontia fauriei", "Cyanea fissa", "Cyanea hardyi", "Lobelia gregoriana", 
"Lobelia hypoleuca", "Lobelia lukwangulensis", "Trematolobelia kauaiensis"
), class = "factor"), .rows = structure(list(1:3), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -1L), .drop = TRUE))

EDIT for better expected outcome: I am expecting an output that has the t.test output but also columns for species and variables, respectively. For example;

structure(list(Species = "Species x", Variable = "Variable x", 
    estimate = 0.10705965418003, estimate1 = 0.460507391694923, 
    estimate2 = 0.353447737514893, statistic = c(t = 1.5638634504756), 
    p.value = 0.198127496637623, parameter = c(df = 3.72179497232624), 
    conf.low = -0.0887480304981188, conf.high = 0.30286733885818, 
    method = "Welch Two Sample t-test", alternative = "two.sided"), row.names = c(NA, 
-1L), class = c("tbl_df", "tbl", "data.frame"))

Any help is greatly appreciated!


Solution

  • Here is another approach using dplyr::rowwise() with nest_by() and expand_grid(). I like having a final data.frame which contains inputs and output and we can verify each piece (the formula form, the data and the variables vars).

    library(tidyverse)
    
    dv <- df %>%
      select(-c(Species, Experiment)) %>%
      colnames()
    
    df |> 
      nest_by(Species) %>% 
      expand_grid(vars = dv) %>% 
      rowwise() %>% 
      mutate(form = list(reformulate("Experiment", vars)),
             res  = list(t.test(form, data = data) %>% 
                           broom::tidy()) 
               
      ) %>% 
      unnest(res)
    
    #> # A tibble: 9 x 14
    #>   Species           data vars  form      estim~1 estim~2 estim~3 stati~4 p.value
    #>   <fct>          <list<> <chr> <list>      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
    #> 1 Apetahia long~ [6 x 4] Alph~ <formula> -0.167    -78.5   -78.3  -0.545   0.619
    #> 2 Apetahia long~ [6 x 4] Alph~ <formula> -2.33     -73.2   -70.8  -0.899   0.463
    #> 3 Apetahia long~ [6 x 4] Beta~ <formula>  0.0733   -39.6   -39.7   0.440   0.686
    #> 4 Clermontia fa~ [6 x 4] Alph~ <formula>  0.297    -79.1   -79.4   1.58    0.236
    #> 5 Clermontia fa~ [6 x 4] Alph~ <formula> -0.107    -70.9   -70.8  -0.604   0.583
    #> 6 Clermontia fa~ [6 x 4] Beta~ <formula>  0.167    -40.6   -40.7   2.05    0.110
    #> 7 Cyanea fissa   [5 x 4] Alph~ <formula>  0.135    -77.8   -77.9   1.45    0.280
    #> 8 Cyanea fissa   [5 x 4] Alph~ <formula> -0.0550   -70.7   -70.7  -0.995   0.423
    #> 9 Cyanea fissa   [5 x 4] Beta~ <formula> -0.0917   -38.7   -38.6  -0.365   0.766
    #> # ... with 5 more variables: parameter <dbl>, conf.low <dbl>, conf.high <dbl>,
    #> #   method <chr>, alternative <chr>, and abbreviated variable names
    #> #   1: estimate, 2: estimate1, 3: estimate2, 4: statistic
    

    Data from OP

    df <- structure(list(Species = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
                                         2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), levels = c("Apetahia longistigmata", 
                                                                                                 "Clermontia fauriei", "Cyanea fissa", "Cyanea hardyi", "Lobelia gregoriana", 
                                                                                                 "Lobelia hypoleuca", "Lobelia lukwangulensis", "Trematolobelia kauaiensis"
                                         ), class = "factor"), Experiment = structure(c(1L, 1L, 1L, 2L, 
                                                                                        2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L), levels = c("standard 1", 
                                                                                                                                                        "standard 2"), class = "factor"), Alpha.peak.1b = c(-78.35, -78.34, 
                                                                                                                                                                                                            -78.84, -78.85, -78.01, -78.17, -79.17, -79.01, -79.01, -79.18, 
                                                                                                                                                                                                            -79.18, -79.72, -77.85, -77.86, -77.69, -78.01, -77.86), Alpha.recrystalization.peak = c(-70.34, 
                                                                                                                                                                                                                                                                                                     -78.34, -70.83, -70.67, -70.67, -71.17, -71, -71.17, -70.67, 
                                                                                                                                                                                                                                                                                                     -70.84, -71.01, -70.67, -70.67, -70.68, -70.84, -70.67, -70.68
                                                                                                                                                                                                            ), Beta.peak.1a = c(-39.37, -39.63, -39.85, -39.53, -39.69, -39.85, 
                                                                                                                                                                                                                                -40.52, -40.52, -40.69, -40.86, -40.69, -40.68, -38.87, -38.56, 
                                                                                                                                                                                                                                -38.72, -38.86, -38.39)), row.names = c(NA, -17L), class = c("tbl_df", 
                                                                                                                                                                                                                                                                                             "tbl", "data.frame"))
    

    Created on 2023-03-06 by the reprex package (v2.0.1)