rdplyrpurrrrsample

calculating bootstrap resampling for grouped variables


I have the following dataset to calculate standardized effect size for Soil_N and Soil_P for which I used the code below for each replicate.

  df <- tibble(
  Soilwater = rep(rep(c("optimal", "reduced"), each = 5), times = 2),
  Diversity = rep(c("high", "low"), each = 10),
  Soil_N = c(50, 45, 49, 48, 49, 69, 68, 69, 70, 67, 79, 78, 79, NA_integer_, 77, 89, 89, 87, 88, 89), 
  Soil_P = c(2, 2, 1, 3, 4, 8, 8, 9, 7, 6, 11, 12, 11, NA_integer_, 10, 18, 18, 17, 21, 19),
  Replicate = c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5))

As value of one replicate from both Soil_N and Soil_P is missing, I would like to calculate the value of remaining 4 replicates to create a value for the missing replicate as per code below.

library(dplyr)
library(tidyr)
df %>%
 pivot_wider(id_cols = c(Diversity, Replicate),
 names_from = Soilwater,
 values_from = c(Soil_N, Soil_P) %>%
 mutate(across(c(optimal, reduced), ~ replace_na(.x, mean(.x, na.rm = TRUE))), 
  ES_N = (Soil_N_optimal - Soil_N_reduced) / (Soil_N_optimal + Soil_N_reduced), 
  ES_P = (Soil_P_optimal - Soil_P_reduced) / (Soil_P_optimal + Soil_P_reduced))%>% 
  group_by(Diversity)

I am sure that this code requires to replace the mean value for Soil_N and Soil_P separately in their respective column but I don't know how.
Followed by this, I am willing to calculate the bootstrap sample from the grouped replicates, grouped by each Diversity with 1000 iterations (meaning that each ES calculation goes for 1000 times using 5 replicates using the formula ES = (optimal-reduced)/(optimal+reduced)). This should be done for both Soil_N and Soil_P.

For bootstrap, I am using the code

df_bs<- bootstraps(df, times = 1000, strata = "Diversity") %>%
mutate(means = map(splits, function(x) {
 as.data.frame(x) %>%
  group_by(Diversity) %>%
  summarise(meanN = mean(ES_N), 
  meanP = mean(ES_P))
  })) %>%
  unnest(means)

This output from here, I would pivot_longer as following to make a graph in ggplot2.

 df_final<- df_bs%>%
 select(meanN, meanP)%>%
 pivot_longer(!Diversity, 
           names_to = "variables", 
           values_to = "ES")
 df_final

Your help is highly appreciated! Thanks!


Solution

  • I think you mean to bootstrap sample from the grouped replicates, grouped by each Diversity category? bootstraps takes the whole dataframe as an argument to sample from. To borrow the code from the examples here, you could sample 5 random replicates from each Diversity category (with replacement) and for each bootstrap resampling extract the means of each category:

    library(tidyverse)
    library(rsample)
    
    df <- tibble(
      Soilwater = rep(rep(c("optimal", "reduced"), each = 5), times = 2),
      Diversity = rep(c("high", "low"), each = 10),
      Soil_N = c(50, 45, 49, 48, 49, 69, 68, 69, 70, 67, 79, 78, 79, NA_integer_, 77, 89, 89, 87, 88, 89),
      Replicate = c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5)
    )
    
    
    df_stand <- df %>%
      pivot_wider(
        id_cols = c(Diversity, Replicate),
        names_from = Soilwater,
        values_from = Soil_N
      ) %>%
      mutate(across(c(optimal, reduced), ~ replace_na(.x, mean(.x, na.rm = TRUE))),
        ES = (optimal - reduced) / (optimal + reduced)
      )
    
    
    bootstraps(df_stand, times = 50, strata = "Diversity") %>%
      mutate(means = map(splits, function(x) {
        as.data.frame(x) %>%
          group_by(Diversity) %>%
          summarise(mean = mean(ES))
      })) %>%
      unnest(means)
    
    
    #> # A tibble: 100 x 4
    #>    splits         id          Diversity    mean
    #>    <list>         <chr>       <chr>       <dbl>
    #>  1 <split [10/3]> Bootstrap01 high      -0.165 
    #>  2 <split [10/3]> Bootstrap01 low       -0.0868
    #>  3 <split [10/3]> Bootstrap02 high      -0.173 
    #>  4 <split [10/3]> Bootstrap02 low       -0.0906
    #>  5 <split [10/4]> Bootstrap03 high      -0.167 
    #>  6 <split [10/4]> Bootstrap03 low       -0.103 
    #>  7 <split [10/3]> Bootstrap04 high      -0.184 
    #>  8 <split [10/3]> Bootstrap04 low       -0.0881
    #>  9 <split [10/4]> Bootstrap05 high      -0.159 
    #> 10 <split [10/4]> Bootstrap05 low       -0.0633
    #> # ... with 90 more rows
    

    Edit

    Your solution is almost there. With the new column of Soil_P added in, your across function needs to select all the new columns after the pivot. You can use across(starts_with("Soil")) to get all the right ones:

    library(tidyverse)
    library(rsample)
    
    df <- tibble(
      Soilwater = rep(rep(c("optimal", "reduced"), each = 5), times = 2),
      Diversity = rep(c("high", "low"), each = 10),
      Soil_N = c(50, 45, 49, 48, 49, 69, 68, 69, 70, 67, 79, 78, 79, NA_integer_, 77, 89, 89, 87, 88, 89), 
      Soil_P = c(2, 2, 1, 3, 4, 8, 8, 9, 7, 6, 11, 12, 11, NA_integer_, 10, 18, 18, 17, 21, 19),
      Replicate = c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5))
    
    
    
    df_stand <- df %>%
      pivot_wider(
        id_cols = c(Diversity, Replicate),
        names_from = Soilwater,
        values_from = c(Soil_N, Soil_P)
      ) %>%
      mutate(
        across(starts_with("Soil"), ~ replace_na(.x, mean(.x, na.rm = TRUE))),
        ES_N = (Soil_N_optimal - Soil_N_reduced) / (Soil_N_optimal + Soil_N_reduced),
        ES_P = (Soil_P_optimal - Soil_P_reduced) / (Soil_P_optimal + Soil_P_reduced)
      )
    
    
    df_bs <- bootstraps(df_stand, times = 1000, strata = 'Diversity') %>%
      mutate(means = map(splits, function(x) {
        as.data.frame(x) %>%
          group_by(Diversity) %>%
          summarise(meanN = mean(ES_N),
                    meanP = mean(ES_P))
      })) %>%
      unnest(means)
    
    df_final <- df_bs %>% 
      select(Diversity, meanN, meanP) %>% 
      pivot_longer(-Diversity,
                   names_to = "variables", 
                   values_to = "ES")
    
    df_final
    
    #> # A tibble: 4,000 x 3
    #>    Diversity variables      ES
    #>    <chr>     <chr>       <dbl>
    #>  1 high      meanN     -0.177 
    #>  2 high      meanP     -0.48  
    #>  3 low       meanN     -0.0611
    #>  4 low       meanP     -0.241 
    #>  5 high      meanN     -0.164 
    #>  6 high      meanP     -0.68  
    #>  7 low       meanN     -0.0787
    #>  8 low       meanP     -0.299 
    #>  9 high      meanN     -0.187 
    #> 10 high      meanP     -0.44  
    #> # ... with 3,990 more rows
    

    This outputs the dataframe with the mean values of each of the 1,000 bootstrap samples for each value of Diversity. And then it's just a case of plotting as needed :).

    Created on 2022-03-18 by the reprex package (v2.0.1)