I built a JAGS model using the R2jags
package to fit a quadratic function to empirical data and estimate the parameters of the function. I would like to use this model to predict new observations from a new data frame coming from a raster layer. The problem is that I don't know how to do it.
First attempt: I tried to apply the code below from the 'First attempt' section, but it doesn't seem correct to me (see the figure below).
I was expecting there to be more noise in the data, meaning more fluctuations in the data.
Second attempt: I then reran the model from the 'Second attempt' section using a new data frame where I created a column y (i.e., the response variable) with NA, but I'm getting this error message:
Error in all$sims.array[, , "deviance", drop = FALSE] :
subscript out of bounds
In addition: Warning message:
In FUN(X[[i]], ...) : Failed to set trace monitor for deviance
There are no observed stochastic nodes
It's the first time making predictions from JAGS models, so any help would be greatly appreciated.
REPRODUCIBLE CODE
library(geodata)
library(R2jags)
raw_data <- data.frame(T = c(-24, -22, -20, -19, -18, -16, -15, -12, -10 , -5, -2, -1, seq(1, 34, by = 1), 35, 37, 39, 41, 42, 43, 45, 46, 47),
dev = c(0, 0, 0, 0, 0, 0, 0, 0.06, 0.6, 0.9, 0.9, 1, rep(1, length(seq(1, 34, by = 1))), c(0.8, 0.8, 0.4, 0, 0, 0, 0, 0, 0)))
## plot(raw_data$T, raw_data$dev)
## Define a JAGS model
mod <- function(){
## Priors
q ~ dunif(0, 1)
T0 ~ dunif(-24, -2)
Tn ~ dunif(35, 47)
sigma ~ dunif(0, 1000)
tau <- 1 / (sigma * sigma)
## Likelihood
for(i in 1:n){
mu[i] <- -1 * q * (T[i] - T0) * (T[i] - Tn) * (Tn > T[i]) * (T0 < T[i]) * (-1 * q * (T[i] - T0) * (T[i] - Tn) < 1) + (-1 * q * (T[i] - T0) * (T[i] - Tn) > 1)
y[i] ~ dnorm(mu[i], tau)
}
}
## Group the data
data <- list(y = raw_data$dev, n = dim(raw_data)[1], T = raw_data$T)
## Define the parameters of interest to be estimated
param <- c("q", "T0", "Tn", "sigma", "mu")
## Define the initial values for the parameters of interest to be estimated
inits <- function(){list(q = 0.01, T0 = -20, Tn = 40, sigma = rlnorm(1))}
## Run the JAGS model
jags_mod <- R2jags::jags(data = data, inits = inits, parameters.to.save = param, n.thin = 8, n.chains = 3, n.burnin = 5000, n.iter = 25000, model.file = mod)
FIRST ATTEMPT:
## Import the raster layer
bio_r <- worldclim_country(country = "France", var="bio", res = 10, path = tempdir())
## plot(bio_r$wc2.1_30s_bio_1)
## Convert the raster layer to a data fame
T_df <- terra::as.data.frame(bio_r$wc2.1_30s_bio_1, xy = TRUE, na.rm = TRUE)
## summary(T_df)
colnames(T_df) <- c("x", "y", "T")
## Apply the quadratic function to the raster layer
q <- jags_mod$BUGSoutput$summary[row.names(jags_mod$BUGSoutput$summary) %in% "q", "mean"]
T0 <- jags_mod$BUGSoutput$summary[row.names(jags_mod$BUGSoutput$summary) %in% "T0", "mean"]
Tn <- jags_mod$BUGSoutput$summary[row.names(jags_mod$BUGSoutput$summary) %in% "Tn", "mean"]
T_df$dev <- -1 * q * (T_df$T - T0) * (T_df$T - Tn) * (Tn > T_df$T) * (T0 < T_df$T) * (-1 * q * (T_df$T - T0) * (T_df$T - Tn) < 1) + (-1 * q * (T_df$T - T0) * (T_df$T - Tn) > 1)
## summary(T_df)
plot(T_df$T, T_df$dev, pch = 16)
SECOND ATTEMPT:
T_df$dev <- NA
new_data <- list(y = T_df$dev, n = dim(T_df)[1], T = T_df$T)
jags_mod <- R2jags::jags(data = new_data, inits = inits, parameters.to.save = param, n.thin = 8, n.chains = 3, n.burnin = 5000, n.iter = 25000, model.file = mod)
It sounds like you want the posterior predictive distribution for some new points. Essentially, what you need to do is calculate mu
for each new point and then for each value of mu
, take a draw from the relevant normal distribution. You can then summarize those draws for each new value of the independent variable (T
) in your case. Here's how I would do it. First, we'll load your data and run the model.
library(geodata)
library(R2jags)
library(ggplot2)
raw_data <- data.frame(T = c(-24, -22, -20, -19, -18, -16, -15, -12, -10 , -5, -2, -1, seq(1, 34, by = 1), 35, 37, 39, 41, 42, 43, 45, 46, 47),
dev = c(0, 0, 0, 0, 0, 0, 0, 0.06, 0.6, 0.9, 0.9, 1, rep(1, length(seq(1, 34, by = 1))), c(0.8, 0.8, 0.4, 0, 0, 0, 0, 0, 0)))
## plot(raw_data$T, raw_data$dev)
## Define a JAGS model
mod <- function(){
## Priors
q ~ dunif(0, 1)
T0 ~ dunif(-24, -2)
Tn ~ dunif(35, 47)
sigma ~ dunif(0, 1000)
tau <- 1 / (sigma * sigma)
## Likelihood
for(i in 1:n){
mu[i] <- -1 * q * (T[i] - T0) * (T[i] - Tn) * (Tn > T[i]) * (T0 < T[i]) * (-1 * q * (T[i] - T0) * (T[i] - Tn) < 1) + (-1 * q * (T[i] - T0) * (T[i] - Tn) > 1)
y[i] ~ dnorm(mu[i], tau)
}
}
## Group the data
data <- list(y = raw_data$dev, n = dim(raw_data)[1], T = raw_data$T)
## Define the parameters of interest to be estimated
param <- c("q", "T0", "Tn", "sigma", "mu")
## Define the initial values for the parameters of interest to be estimated
inits <- function(){list(q = 0.01, T0 = -20, Tn = 40, sigma = rlnorm(1))}
## Run the JAGS model
jags_mod <- R2jags::jags(data = data, inits = inits, parameters.to.save = param, n.thin = 8, n.chains = 3, n.burnin = 5000, n.iter = 25000, model.file = mod)
#> module glm loaded
#> Compiling model graph
#> Resolving undeclared variables
#> Allocating nodes
#> Graph information:
#> Observed stochastic nodes: 55
#> Unobserved stochastic nodes: 4
#> Total graph size: 622
#>
#> Initializing model
Now, get the new data.
bio_r <- worldclim_country(country = "France", var="bio", res = 10, path = tempdir())
## Convert the raster layer to a data fame
T_df <- terra::as.data.frame(bio_r$wc2.1_30s_bio_1, xy = TRUE, na.rm = TRUE)
colnames(T_df) <- c("x", "y", "T")
T_df$dev <- NA
new_data <- list(y = T_df$dev, n = dim(T_df)[1], T = T_df$T)
One thing that will make this computationally intensive is that there are more than 1.6M points in your new data. However, there are only around 11k unique values of T
. As such, we could just use the unique values and then if you needed to have them correspond with the original 1.6M values, you could just merge them back in on T
.
new_data_small <- list(T = sort(unique(new_data$T)))
new_data_small$n <- length(new_data_small$T)
Take the draws from the parameters from the model object and turn them into a data frame.
pars <- as.mcmc(jags_mod)
pars <- do.call(rbind, pars)
pars <- as.data.frame(pars)
Next, make each row of pars
a different element of a list we'll call sp
.
sp <- split(pars[,c("q", "T0", "Tn")], 1:nrow(pars))
Now, we can make post_mu
- the posterior values of mu
for the new_data.
post_mu <- sapply(sp, \(x)with(new_data_small, -1 * x$q * (T - x$T0) * (T - x$Tn) * (x$Tn > T) * (x$T0 < T) * (-1 * x$q * (T - x$T0) * (T - x$Tn) < 1) + (-1 * x$q * (T - x$T0) * (T - x$Tn) > 1)))
Next, we use the posterior values of mu
and sigma
to draw from a normal distribution for each posterior draw.
post_obs <- sapply(1:ncol(post_mu), \(i)rnorm(nrow(post_mu), post_mu[,i], pars$sigma[i]))
Next, we summarize the posterior means and posterior predictive draws to get the 95% credible intervals.
post_mu_sum <- apply(post_mu, 1, \(x)c(mean(x), median(x), quantile(x, .025), quantile(x, .975)))
post_mu_sum <- t(post_mu_sum)
colnames(post_mu_sum) <- c("mu_mean", "mu_median", "mu_lwr", "mu_upr")
ppd_sum <- apply(post_obs, 1, \(x)c(mean(x), median(x), quantile(x, .025), quantile(x, .975)))
ppd_sum <- t(ppd_sum)
colnames(ppd_sum) <- c("ppd_mean", "ppd_median", "ppd_lwr", "ppd_upr")
post_mu_sum <- as.data.frame(post_mu_sum)
post_mu_sum$T <- new_data_small$T
post_mu_sum <- cbind(post_mu_sum, ppd_sum)
Finally, you can plot the relationship between T
and the posterior predictive draws of the dependent variable.
ggplot(post_mu_sum, aes(x=T, y=ppd_mean, ymin=ppd_lwr, ymax=ppd_upr)) +
geom_ribbon(alpha=.25) +
geom_line() +
theme_classic()
Created on 2024-04-27 with reprex v2.0.2
On a side note, I would discourage you from using T
as a variable name. It is an abbreviation for the logical value TRUE
and as such could lead to confusion, especially if you're developing or sharing code with other people.