I am trying to add a progress bar to a bootstrap function in R. I tried to make the example function as simple as possible (hence i'm using mean in this example).
library(boot)
v1 <- rnorm(1000)
rep_count = 1
m.boot <- function(data, indices) {
d <- data[indices]
setWinProgressBar(pb, rep_count)
rep_count <- rep_count + 1
Sys.sleep(0.01)
mean(d, na.rm = T)
}
tot_rep <- 200
pb <- winProgressBar(title = "Bootstrap in progress", label = "",
min = 0, max = tot_rep, initial = 0, width = 300)
b <- boot(v1, m.boot, R = tot_rep)
close(pb)
The bootstrap functions properly, but the problem is that the value of rep_count
does not increase in the loop and the progress bar stays frozen during the process.
If I check the value of rep_count
after the bootstrap is complete, it is still 1.
What am i doing wrong? maybe the boot function does not simply insert the m.boot
function in a loop and so the variables in it are not increased?
Thank you.
The pbapply package was designed to work with vectorized functions. There are 2 ways to achieve that in the context of this question: (1) write a wrapper as was suggested, which will not produce the same object of class 'boot'
; (2) alternatively, the line lapply(seq_len(RR), fn)
can be written as pblapply(seq_len(RR), fn)
. Option 2 can happen either by locally copying/updating the boot
function as shown in the example below, or asking the package maintainer, Brian Ripley, if he would consider adding a progress bar directly or through pbapply as dependency.
My solution (changes indicated by comments):
library(boot)
library(pbapply)
boot2 <- function (data, statistic, R, sim = "ordinary", stype = c("i",
"f", "w"), strata = rep(1, n), L = NULL, m = 0, weights = NULL,
ran.gen = function(d, p) d, mle = NULL, simple = FALSE, ...,
parallel = c("no", "multicore", "snow"), ncpus = getOption("boot.ncpus",
1L), cl = NULL)
{
call <- match.call()
stype <- match.arg(stype)
if (missing(parallel))
parallel <- getOption("boot.parallel", "no")
parallel <- match.arg(parallel)
have_mc <- have_snow <- FALSE
if (parallel != "no" && ncpus > 1L) {
if (parallel == "multicore")
have_mc <- .Platform$OS.type != "windows"
else if (parallel == "snow")
have_snow <- TRUE
if (!have_mc && !have_snow)
ncpus <- 1L
loadNamespace("parallel")
}
if (simple && (sim != "ordinary" || stype != "i" || sum(m))) {
warning("'simple=TRUE' is only valid for 'sim=\"ordinary\", stype=\"i\", n=0', so ignored")
simple <- FALSE
}
if (!exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE))
runif(1)
seed <- get(".Random.seed", envir = .GlobalEnv, inherits = FALSE)
n <- NROW(data)
if ((n == 0) || is.null(n))
stop("no data in call to 'boot'")
temp.str <- strata
strata <- tapply(seq_len(n), as.numeric(strata))
t0 <- if (sim != "parametric") {
if ((sim == "antithetic") && is.null(L))
L <- empinf(data = data, statistic = statistic, stype = stype,
strata = strata, ...)
if (sim != "ordinary")
m <- 0
else if (any(m < 0))
stop("negative value of 'm' supplied")
if ((length(m) != 1L) && (length(m) != length(table(strata))))
stop("length of 'm' incompatible with 'strata'")
if ((sim == "ordinary") || (sim == "balanced")) {
if (isMatrix(weights) && (nrow(weights) != length(R)))
stop("dimensions of 'R' and 'weights' do not match")
}
else weights <- NULL
if (!is.null(weights))
weights <- t(apply(matrix(weights, n, length(R),
byrow = TRUE), 2L, normalize, strata))
if (!simple)
i <- index.array(n, R, sim, strata, m, L, weights)
original <- if (stype == "f")
rep(1, n)
else if (stype == "w") {
ns <- tabulate(strata)[strata]
1/ns
}
else seq_len(n)
t0 <- if (sum(m) > 0L)
statistic(data, original, rep(1, sum(m)), ...)
else statistic(data, original, ...)
rm(original)
t0
}
else statistic(data, ...)
pred.i <- NULL
fn <- if (sim == "parametric") {
ran.gen
data
mle
function(r) {
dd <- ran.gen(data, mle)
statistic(dd, ...)
}
}
else {
if (!simple && ncol(i) > n) {
pred.i <- as.matrix(i[, (n + 1L):ncol(i)])
i <- i[, seq_len(n)]
}
if (stype %in% c("f", "w")) {
f <- freq.array(i)
rm(i)
if (stype == "w")
f <- f/ns
if (sum(m) == 0L)
function(r) statistic(data, f[r, ], ...)
else function(r) statistic(data, f[r, ], pred.i[r,
], ...)
}
else if (sum(m) > 0L)
function(r) statistic(data, i[r, ], pred.i[r, ],
...)
else if (simple)
function(r) statistic(data, index.array(n, 1, sim,
strata, m, L, weights), ...)
else function(r) statistic(data, i[r, ], ...)
}
RR <- sum(R)
res <- if (ncpus > 1L && (have_mc || have_snow)) {
if (have_mc) {
parallel::mclapply(seq_len(RR), fn, mc.cores = ncpus)
}
else if (have_snow) {
list(...)
if (is.null(cl)) {
cl <- parallel::makePSOCKcluster(rep("localhost",
ncpus))
if (RNGkind()[1L] == "L'Ecuyer-CMRG")
parallel::clusterSetRNGStream(cl)
res <- pblapply(seq_len(RR), fn, cl=cl)
parallel::stopCluster(cl)
res
}
else pblapply(seq_len(RR), fn, cl=cl)
}
}
else pblapply(seq_len(RR), fn) #### changed !!!
t.star <- matrix(, RR, length(t0))
for (r in seq_len(RR)) t.star[r, ] <- res[[r]]
if (is.null(weights))
weights <- 1/tabulate(strata)[strata]
boot.return(sim, t0, t.star, temp.str, R, data, statistic,
stype, call, seed, L, m, pred.i, weights, ran.gen, mle)
}
## Functions not exported by boot
isMatrix <- boot:::isMatrix
index.array <- boot:::index.array
boot.return <- boot:::boot.return
## Now the example
m.boot <- function(data, indices) {
d <- data[indices]
mean(d, na.rm = T)
}
tot_rep <- 200
v1 <- rnorm(1000)
b <- boot2(v1, m.boot, R = tot_rep)