I have the following dataset to calculate standardized effect size for Soil_N and Soil_P for which I used the code below for each replicate.
df <- tibble(
Soilwater = rep(rep(c("optimal", "reduced"), each = 5), times = 2),
Diversity = rep(c("high", "low"), each = 10),
Soil_N = c(50, 45, 49, 48, 49, 69, 68, 69, 70, 67, 79, 78, 79, NA_integer_, 77, 89, 89, 87, 88, 89),
Soil_P = c(2, 2, 1, 3, 4, 8, 8, 9, 7, 6, 11, 12, 11, NA_integer_, 10, 18, 18, 17, 21, 19),
Replicate = c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5))
As value of one replicate from both Soil_N and Soil_P is missing, I would like to calculate the value of remaining 4 replicates to create a value for the missing replicate as per code below.
library(dplyr)
library(tidyr)
df %>%
pivot_wider(id_cols = c(Diversity, Replicate),
names_from = Soilwater,
values_from = c(Soil_N, Soil_P) %>%
mutate(across(c(optimal, reduced), ~ replace_na(.x, mean(.x, na.rm = TRUE))),
ES_N = (Soil_N_optimal - Soil_N_reduced) / (Soil_N_optimal + Soil_N_reduced),
ES_P = (Soil_P_optimal - Soil_P_reduced) / (Soil_P_optimal + Soil_P_reduced))%>%
group_by(Diversity)
I am sure that this code requires to replace the mean value for Soil_N and Soil_P separately in their respective column but I don't know how.
Followed by this, I am willing to calculate the bootstrap sample from the grouped replicates, grouped by each Diversity with 1000 iterations (meaning that each ES calculation goes for 1000 times using 5 replicates using the formula ES = (optimal-reduced)/(optimal+reduced)). This should be done for both Soil_N and Soil_P.
For bootstrap, I am using the code
df_bs<- bootstraps(df, times = 1000, strata = "Diversity") %>%
mutate(means = map(splits, function(x) {
as.data.frame(x) %>%
group_by(Diversity) %>%
summarise(meanN = mean(ES_N),
meanP = mean(ES_P))
})) %>%
unnest(means)
This output from here, I would pivot_longer
as following to make a graph in ggplot2.
df_final<- df_bs%>%
select(meanN, meanP)%>%
pivot_longer(!Diversity,
names_to = "variables",
values_to = "ES")
df_final
Your help is highly appreciated! Thanks!
I think you mean to bootstrap sample from the grouped replicates, grouped by each Diversity
category? bootstraps
takes the whole dataframe as an argument to sample from. To borrow the code from the examples here, you could sample 5 random replicates from each Diversity category (with replacement) and for each bootstrap resampling extract the means of each category:
library(tidyverse)
library(rsample)
df <- tibble(
Soilwater = rep(rep(c("optimal", "reduced"), each = 5), times = 2),
Diversity = rep(c("high", "low"), each = 10),
Soil_N = c(50, 45, 49, 48, 49, 69, 68, 69, 70, 67, 79, 78, 79, NA_integer_, 77, 89, 89, 87, 88, 89),
Replicate = c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5)
)
df_stand <- df %>%
pivot_wider(
id_cols = c(Diversity, Replicate),
names_from = Soilwater,
values_from = Soil_N
) %>%
mutate(across(c(optimal, reduced), ~ replace_na(.x, mean(.x, na.rm = TRUE))),
ES = (optimal - reduced) / (optimal + reduced)
)
bootstraps(df_stand, times = 50, strata = "Diversity") %>%
mutate(means = map(splits, function(x) {
as.data.frame(x) %>%
group_by(Diversity) %>%
summarise(mean = mean(ES))
})) %>%
unnest(means)
#> # A tibble: 100 x 4
#> splits id Diversity mean
#> <list> <chr> <chr> <dbl>
#> 1 <split [10/3]> Bootstrap01 high -0.165
#> 2 <split [10/3]> Bootstrap01 low -0.0868
#> 3 <split [10/3]> Bootstrap02 high -0.173
#> 4 <split [10/3]> Bootstrap02 low -0.0906
#> 5 <split [10/4]> Bootstrap03 high -0.167
#> 6 <split [10/4]> Bootstrap03 low -0.103
#> 7 <split [10/3]> Bootstrap04 high -0.184
#> 8 <split [10/3]> Bootstrap04 low -0.0881
#> 9 <split [10/4]> Bootstrap05 high -0.159
#> 10 <split [10/4]> Bootstrap05 low -0.0633
#> # ... with 90 more rows
Your solution is almost there. With the new column of Soil_P
added in, your across
function needs to select all the new columns after the pivot. You can use across(starts_with("Soil"))
to get all the right ones:
library(tidyverse)
library(rsample)
df <- tibble(
Soilwater = rep(rep(c("optimal", "reduced"), each = 5), times = 2),
Diversity = rep(c("high", "low"), each = 10),
Soil_N = c(50, 45, 49, 48, 49, 69, 68, 69, 70, 67, 79, 78, 79, NA_integer_, 77, 89, 89, 87, 88, 89),
Soil_P = c(2, 2, 1, 3, 4, 8, 8, 9, 7, 6, 11, 12, 11, NA_integer_, 10, 18, 18, 17, 21, 19),
Replicate = c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5))
df_stand <- df %>%
pivot_wider(
id_cols = c(Diversity, Replicate),
names_from = Soilwater,
values_from = c(Soil_N, Soil_P)
) %>%
mutate(
across(starts_with("Soil"), ~ replace_na(.x, mean(.x, na.rm = TRUE))),
ES_N = (Soil_N_optimal - Soil_N_reduced) / (Soil_N_optimal + Soil_N_reduced),
ES_P = (Soil_P_optimal - Soil_P_reduced) / (Soil_P_optimal + Soil_P_reduced)
)
df_bs <- bootstraps(df_stand, times = 1000, strata = 'Diversity') %>%
mutate(means = map(splits, function(x) {
as.data.frame(x) %>%
group_by(Diversity) %>%
summarise(meanN = mean(ES_N),
meanP = mean(ES_P))
})) %>%
unnest(means)
df_final <- df_bs %>%
select(Diversity, meanN, meanP) %>%
pivot_longer(-Diversity,
names_to = "variables",
values_to = "ES")
df_final
#> # A tibble: 4,000 x 3
#> Diversity variables ES
#> <chr> <chr> <dbl>
#> 1 high meanN -0.177
#> 2 high meanP -0.48
#> 3 low meanN -0.0611
#> 4 low meanP -0.241
#> 5 high meanN -0.164
#> 6 high meanP -0.68
#> 7 low meanN -0.0787
#> 8 low meanP -0.299
#> 9 high meanN -0.187
#> 10 high meanP -0.44
#> # ... with 3,990 more rows
This outputs the dataframe with the mean values of each of the 1,000 bootstrap samples for each value of Diversity
. And then it's just a case of plotting as needed :).
Created on 2022-03-18 by the reprex package (v2.0.1)