
Predicting new observations from a JAGS model (R2jags package) using new GIS-based data


I built a JAGS model with the R2jags package to fit a quadratic function to empirical data and estimate its parameters. I would now like to use this model to predict new observations for a new data frame derived from a raster layer, but I don't know how to do it.

First attempt: I applied the code in the 'First attempt' section below, but the result doesn't look right to me (see the figure below).

[Figure: predicted dev values plotted against T from the first attempt]

I was expecting more noise in the predictions, i.e., more scatter around the fitted curve.

Second attempt: I then reran the model (see the 'Second attempt' section below) on a new data frame in which the response column (passed as y) is set to NA, but I get this error message:

Error in all$sims.array[, , "deviance", drop = FALSE] : 
  subscript out of bounds
In addition: Warning message:
In FUN(X[[i]], ...) : Failed to set trace monitor for deviance
There are no observed stochastic nodes

This is my first time making predictions from a JAGS model, so any help would be greatly appreciated.

REPRODUCIBLE CODE

library(geodata)
library(R2jags)

raw_data <- data.frame(T = c(-24, -22, -20, -19, -18, -16, -15, -12, -10 , -5, -2, -1, seq(1, 34, by = 1), 35, 37, 39, 41, 42, 43, 45, 46, 47), 
                       dev = c(0, 0, 0, 0, 0, 0, 0, 0.06, 0.6, 0.9, 0.9, 1, rep(1, length(seq(1, 34, by = 1))), c(0.8, 0.8, 0.4, 0, 0, 0, 0, 0, 0)))
## plot(raw_data$T, raw_data$dev)

## Define a JAGS model
mod <- function(){
  
  ## Priors
  q ~ dunif(0, 1)
  T0 ~ dunif(-24, -2)
  Tn ~ dunif(35, 47)
  sigma ~ dunif(0, 1000)
  tau <- 1 / (sigma * sigma)
  
  ## Likelihood
  for(i in 1:n){
    ## Downward-opening quadratic between T0 and Tn, set to 0 outside that range and capped at 1
    mu[i] <- -1 * q * (T[i] - T0) * (T[i] - Tn) * (Tn > T[i]) * (T0 < T[i]) * (-1 * q * (T[i] - T0) * (T[i] - Tn) < 1) + (-1 * q * (T[i] - T0) * (T[i] - Tn) > 1)
    y[i] ~ dnorm(mu[i], tau)
  }
  
}

## Group the data
data <- list(y = raw_data$dev, n = dim(raw_data)[1], T = raw_data$T)

## Define the parameters of interest to be estimated
param <- c("q", "T0", "Tn", "sigma", "mu")

## Define the initial values for the parameters of interest to be estimated
inits <- function(){list(q = 0.01, T0 = -20, Tn = 40, sigma = rlnorm(1))}

## Run the JAGS model
jags_mod <- R2jags::jags(data = data, inits = inits, parameters.to.save = param, n.thin = 8, n.chains = 3, n.burnin = 5000, n.iter = 25000, model.file = mod)

FIRST ATTEMPT:

## Import the raster layer
bio_r <- worldclim_country(country = "France", var="bio", res = 10, path = tempdir())
## plot(bio_r$wc2.1_30s_bio_1)

## Convert the raster layer to a data frame
T_df <- terra::as.data.frame(bio_r$wc2.1_30s_bio_1, xy = TRUE, na.rm = TRUE)
## summary(T_df)
colnames(T_df) <- c("x", "y", "T")

## Apply the quadratic function to the raster layer
q <- jags_mod$BUGSoutput$summary[row.names(jags_mod$BUGSoutput$summary) %in% "q", "mean"]
T0 <- jags_mod$BUGSoutput$summary[row.names(jags_mod$BUGSoutput$summary) %in% "T0", "mean"]
Tn <- jags_mod$BUGSoutput$summary[row.names(jags_mod$BUGSoutput$summary) %in% "Tn", "mean"]
T_df$dev <- -1 * q * (T_df$T - T0) * (T_df$T - Tn) * (Tn > T_df$T) * (T0 < T_df$T) * (-1 * q * (T_df$T - T0) * (T_df$T - Tn) < 1) + (-1 * q * (T_df$T - T0) * (T_df$T - Tn) > 1)
## summary(T_df)
plot(T_df$T, T_df$dev, pch = 16)

SECOND ATTEMPT:

T_df$dev <- NA
new_data <- list(y = T_df$dev, n = dim(T_df)[1], T = T_df$T)
jags_mod <- R2jags::jags(data = new_data, inits = inits, parameters.to.save = param, n.thin = 8, n.chains = 3, n.burnin = 5000, n.iter = 25000, model.file = mod)

Solution

  • It sounds like you want the posterior predictive distribution for some new points. Essentially, you need to calculate mu for each new point and then, for each value of mu, take a draw from the corresponding normal distribution. You can then summarize those draws for each new value of the independent variable (T in your case). This also explains why your first attempt looks too smooth: plugging the posterior means of q, T0 and Tn into the curve gives only the fitted mean, with no observation noise around it. Here's how I would do it. First, we'll load your data and run the model.

    library(geodata)
    library(R2jags)
    library(ggplot2)
    raw_data <- data.frame(T = c(-24, -22, -20, -19, -18, -16, -15, -12, -10 , -5, -2, -1, seq(1, 34, by = 1), 35, 37, 39, 41, 42, 43, 45, 46, 47), 
                           dev = c(0, 0, 0, 0, 0, 0, 0, 0.06, 0.6, 0.9, 0.9, 1, rep(1, length(seq(1, 34, by = 1))), c(0.8, 0.8, 0.4, 0, 0, 0, 0, 0, 0)))
    ## plot(raw_data$T, raw_data$dev)
    
    ## Define a JAGS model
    mod <- function(){
      
      ## Priors
      q ~ dunif(0, 1)
      T0 ~ dunif(-24, -2)
      Tn ~ dunif(35, 47)
      sigma ~ dunif(0, 1000)
      tau <- 1 / (sigma * sigma)
      
      ## Likelihood
      for(i in 1:n){
        mu[i] <- -1 * q * (T[i] - T0) * (T[i] - Tn) * (Tn > T[i]) * (T0 < T[i]) * (-1 * q * (T[i] - T0) * (T[i] - Tn) < 1) + (-1 * q * (T[i] - T0) * (T[i] - Tn) > 1)
        y[i] ~ dnorm(mu[i], tau)
      }
      
    }
    
    ## Group the data
    data <- list(y = raw_data$dev, n = dim(raw_data)[1], T = raw_data$T)
    
    ## Define the parameters of interest to be estimated
    param <- c("q", "T0", "Tn", "sigma", "mu")
    
    ## Define the initial values for the parameters of interest to be estimated
    inits <- function(){list(q = 0.01, T0 = -20, Tn = 40, sigma = rlnorm(1))}
    
    ## Run the JAGS model
    jags_mod <- R2jags::jags(data = data, inits = inits, parameters.to.save = param, n.thin = 8, n.chains = 3, n.burnin = 5000, n.iter = 25000, model.file = mod)
    #> module glm loaded
    #> Compiling model graph
    #>    Resolving undeclared variables
    #>    Allocating nodes
    #> Graph information:
    #>    Observed stochastic nodes: 55
    #>    Unobserved stochastic nodes: 4
    #>    Total graph size: 622
    #> 
    #> Initializing model
    

    Now, get the new data.

    bio_r <- worldclim_country(country = "France", var="bio", res = 10, path = tempdir())
    
    ## Convert the raster layer to a data frame
    T_df <- terra::as.data.frame(bio_r$wc2.1_30s_bio_1, xy = TRUE, na.rm = TRUE)
    colnames(T_df) <- c("x", "y", "T")
    
    T_df$dev <- NA
    new_data <- list(y = T_df$dev, n = dim(T_df)[1], T = T_df$T)
    

    One thing that will make this computationally intensive is that there are more than 1.6M points in your new data. However, there are only around 11k unique values of T, so we can work with just the unique values; if you later need predictions for the original 1.6M rows, you can merge the summaries back in on T (a sketch of that merge is shown after the plot below).

    new_data_small <- list(T = sort(unique(new_data$T)))
    new_data_small$n <- length(new_data_small$T)
    

    Take the posterior draws of the parameters from the model object and turn them into a data frame.

    pars <- as.mcmc(jags_mod)
    pars <- do.call(rbind, pars)
    pars <- as.data.frame(pars)
    

    Next, make each row of pars a different element of a list we'll call sp.

    sp <- split(pars[,c("q", "T0", "Tn")], 1:nrow(pars))
    

    Now, we can make post_mu, the posterior values of mu for the new data in new_data_small.

    post_mu <- sapply(sp, \(x)with(new_data_small, -1 * x$q * (T - x$T0) * (T - x$Tn) * (x$Tn > T) * (x$T0 < T) * (-1 * x$q * (T - x$T0) * (T - x$Tn) < 1) + (-1 * x$q * (T - x$T0) * (T - x$Tn) > 1)))
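
    Each column of post_mu corresponds to one row of pars, which is what lets us pair it with the matching sigma draw in the next step. A quick way to check that the dimensions line up:

    ## post_mu: one row per unique value of T, one column per posterior draw
    dim(post_mu)
    ## should match c(length(new_data_small$T), nrow(pars))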
    

    Next, we use the posterior values of mu and sigma to draw from a normal distribution for each posterior draw.

    post_obs <- sapply(1:ncol(post_mu), \(i)rnorm(nrow(post_mu), post_mu[,i], pars$sigma[i]))
    

    Next, we summarize the posterior draws of mu and the posterior predictive draws to get means, medians, and 95% credible intervals.

    post_mu_sum <- apply(post_mu, 1, \(x)c(mean(x), median(x), quantile(x, .025), quantile(x, .975)))
    post_mu_sum <- t(post_mu_sum)
    colnames(post_mu_sum) <- c("mu_mean", "mu_median", "mu_lwr", "mu_upr")
    ppd_sum <- apply(post_obs, 1, \(x)c(mean(x), median(x), quantile(x, .025), quantile(x, .975)))
    ppd_sum <- t(ppd_sum)
    colnames(ppd_sum) <- c("ppd_mean", "ppd_median", "ppd_lwr", "ppd_upr")
    post_mu_sum <- as.data.frame(post_mu_sum)
    post_mu_sum$T <- new_data_small$T
    post_mu_sum <- cbind(post_mu_sum, ppd_sum)
    

    Finally, you can plot the relationship between T and the posterior predictive draws of the dependent variable.

    ggplot(post_mu_sum, aes(x=T, y=ppd_mean, ymin=ppd_lwr, ymax=ppd_upr)) + 
      geom_ribbon(alpha=.25) + 
      geom_line() + 
      theme_classic()
    

    Created on 2024-04-27 with reprex v2.0.2
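
    If you want the predictions back on the full raster data frame (the merge-back mentioned earlier), one way to do it is to join the summaries onto T_df by T. This is just a sketch; T_pred is an illustrative name, not an object used above:

    ## Attach the summarized predictions to every raster cell by matching on T
    T_pred <- merge(T_df, post_mu_sum, by = "T", all.x = TRUE)
    ## merge() reorders rows, so keep the x/y columns around for mapping
    head(T_pred)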

    On a side note, I would discourage you from using T as a variable name. In R, T is shorthand for the logical value TRUE, so reusing it for data can lead to confusion, especially if you're developing or sharing code with other people.
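
    For instance, a minimal illustration of the hazard:

    T                 # by default, T is just a variable bound to TRUE
    #> [1] TRUE
    T <- raw_data$T   # assigning to T silently masks that default
    isTRUE(T)         # code that relies on T meaning TRUE now misbehaves
    #> [1] FALSE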