I have not been able to find an example of bootstrapping correlations between two data frames. Key posts I have looked at are (1) https://stats.stackexchange.com/questions/20701/computing-p-value-using-bootstrap-with-r/ (2) How to bootstrap correlation using vectorised function applied to large matrix?
NB/ p values here need to be obtained through calculating the ratios of how many times the absolute bootstrap test statistics exceeded the theoretical ones,
Fortunately, I have also come across a recent example of using Map to apply the correlations between two data frames. How to use lapply to replace nested for loop to get correlations between two data frames?
I have large datasets that will be run in a unix based HPC and also a Windows OS option for running the calculations on smaller datasets in R.
D1 <- data.frame(matrix(runif(10*10, 0, 2), ncol=10))
D2 <- data.frame(matrix(runif(10*16, 0, 2), ncol=16))
colnames(D1) <- paste0("a", 1:ncol(D1))
colnames(D2) <- paste0("b", 1:ncol(D2))
compare <- expand.grid(colnames(D1), colnames(D2))
need_modify <- Map(function(x,y) cor.test(D1[, x], D2[, y], method="spearman"), compare$Var1, compare$Var2) %>%
lapply(`[`, c('estimate', 'statistic', 'p.value')) %>%
sapply(unlist) %>% t() %>% as.data.frame() %>% mutate(Var1=compare$Var1, Var2=compare$Var2)
boot_df <- function(x) x[sample(nrow(x), rep = T), ]
#number of bootstraps
R <- 100
I am stuck on modifying the above so that it run successfully using parallelisation for Unix based OS (mclapply or mcMap) and also a separate one for Windows (clusterMap or future_mapply).
Grateful for any pointers in the right direction or an example elsewhere.
Technically you could resample B times with replacement, extract the "statistic"
s,
stopifnot(identical(dim(D1)[1], dim(D2)[1])) ## check identical nrows
r.names <- Reduce(\(...) paste(..., sep=':'), Cmp) ## store Cmp names
ncpu <- 7L ## number of workers
B <- 999L ## number of bootstrap replications
set.seed(42) ## for sake of reproducibility
T_star <- parallel::mcmapply(function(x, y) {
boot <- function() { ## boot fun
smp <- sample.int(dim(D1)[1], dim(D1)[1], ## resample w/ replacement
replace=TRUE)
unlist(cor.test(D1[smp, x], D2[smp, y], method="spearman", exact=FALSE)[
c('statistic')]) ## extract statistic
}
replicate(B, boot()) ## replicate boot fun B times
}, Cmp$Var1, Cmp$Var2, mc.cores=ncpu, mc.set.seed=FALSE, SIMPLIFY=FALSE) |> setNames(r.names)
lapply(T_star, head)[1:3]
# $`a1:b1`
# statistic.S statistic.S statistic.S statistic.S statistic.S statistic.S
# 212.4375 208.3125 205.7407 278.1132 216.5625 233.5714
#
# $`a2:b1`
# statistic.S statistic.S statistic.S statistic.S statistic.S statistic.S
# 270.18750 195.93750 258.70370 130.75472 216.56250 77.14286
#
# $`a3:b1`
# statistic.S statistic.S statistic.S statistic.S statistic.S statistic.S
# 144.3750 270.1875 220.0000 220.0000 198.0000 158.5714
and according to Davison & Hinkley (1997)'s formula,
calculate a Monte Carlo P-value add up how many times the actual statistic,
T_act <- parallel::mcmapply(\(x, y) cor.test(D1[, x], D2[, y], meth='s'),
Cmp$Var1, Cmp$Var2, mc.cores=ncpu, SIMPLIFY=FALSE) |>
lapply(`[[`, c('statistic')) |>
sapply(unlist) |> setNames(r.names)
T_act[1:3]
# a1:b1 a2:b1 a3:b1
# 210 156 232
is exceeded divided by B (using a bias correction by adding +1
to both, the numerator and denominator).
p_values <- parallel::mcmapply(\(x, y) (sum(abs(x) >= abs(y)) + 1)/(B + 1),
T_star, T_act, mc.cores=ncpu) |>
setNames(r.names) |> unlist()
p_values[1:3]
# a1:b1 a2:b1 a3:b1
# 0.507 0.477 0.490
Of course, we can combine everything and simplify to:
ncpu <- 7L ## number of workers
B <- 999L ## number of bootstrap replications
set.seed(42) ## for sake of reproducibility
p_values2 <- parallel::mcmapply(\(x, y) {
boot <- \() { ## boot fun
smp <- sample.int(dim(D1)[1], dim(D1)[1], ## resample w/ replacement
replace=TRUE)
unlist(cor.test(D1[smp, x], D2[smp, y], method="spearman", exact=FALSE)[
c('statistic')]) ## extract statistic
}
t_star <- replicate(B, boot()) ## replicate boot fun B times
t_act <- unlist(cor.test(D1[, x], D2[, y], method="spearman", exact=FALSE)[
c('statistic')])
(sum(abs(t_star) >= abs(t_act)) + 1)/(B + 1) ## calc. MC p-value
}, Cmp$Var1, Cmp$Var2, mc.cores=ncpu, mc.set.seed=FALSE) |> unlist() |>
setNames(r.names)
head(p_values2, 3)
# a1:b1 a2:b1 a3:b1
# 0.507 0.477 0.490
Note, that the answer focuses on how to do it technically. Read Davison & Hinkley (1997) or consult a statistician if you really want it to be sound.
Data:
set.seed(42)
D1 <- data.frame(matrix(runif(10*10, 0, 2), ncol=10)) |>
setNames(paste0("a", seq_len(10)))
D2 <- data.frame(matrix(runif(10*16, 0, 2), ncol=16)) |>
setNames(paste0("b", seq_len(16)))
Cmp <- expand.grid(colnames(D1), colnames(D2))