rsimulationrstanarm

Calculate fitted values from simulated posterior distribution for multiple factors in R


I am trying to simulate data from a liner mixed model in R using the sim() function from the arm package.

Here is a subset of my data.

> dput(data)
structure(list(id = c(989001026313185, 989001026313143, 989001026313517, 
989001004605135, 989001026313424, 989001006700707, 989001005924838, 
989001004605145, 989001006700794, 989001026313151, 989001006700633, 
989001005924953, 989001005924862, 989001026313498, 989001026313255, 
989001026313461, 989001026313160, 989001026313256, 989001026313322, 
989001026313172, 989001026313388, 989001026313378, 989001026313357, 
989001005924856, 989001026313343, 989001026313417, 989001026313369, 
989001026313152, 985121031803859, 989001026313369, 989001006700560, 
989001026313370, 989001006700537, 989001026313424, 989001026313463, 
989001030278434), sex = c("F", "F", "F", "F", "F", "F", "F", 
"F", "F", "F", "F", "F", "F", "F", "M", "M", "M", "M", "M", "F", 
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "M", "F", 
"F", "F", "M"), size = c(52, 75, 71.5, 90, 80.6, 67.6, 86.5, 
91.6, 103.1, 77, 70.8, 103.2, 88, 85.2, 50, 51.2, 45.4, 46, 50, 
95.6, 85.8, 82, 87.4, 74.6, 74, 93.9, 91, 92.5, 88.3, 92, 92.6, 
54.2, 60, 80.2, 94, 51.9), loc = c("loc1", "loc1", "loc1", "loc1", 
"loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc1", 
"loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc2", 
"loc2", "loc2", "loc2", "loc2", "loc2", "loc2", "loc2", "loc2", 
"loc2", "loc2", "loc2", "loc3", "loc3", "loc3", "loc3", "loc3"
), output = c(-12.2, -13.9, -14.1, -13.1, -11.7, -11.6, -11.6, 
-12.6, -12.6, -13.3, -13.3, -13, -13.4, -13.9, -13.8, -11.9, 
-13.5, -11.5, -10.7, -15.1, -11.7, -13.9, -12.9, -14.6, -14.3, 
-14.9, -13.8, -13.7, -11.8, -14.1, -12.2, -12.5, -12.2, -11.6, 
-11.9, -11)), class = "data.frame", row.names = c(NA, -36L))

I ran a LMM to understand the influence of location (factor, 3 levels: loc1, loc2, loc3), sex (factor, 2 levels: M and F) and size (continuous in cm) on the output:

mod <- lme4::lmer(output ~ sex + size + location + (1|id)

I then drew 1000 random values from the joint posterior ditribution of the model parameters.

nsim <- 1000
bsim_mod <- sim(mod, n.sim = nsim) 

I would like to plot the results now by visualising the credible interval of the bsim_mod object and the observations.

While I can do that for the different size range in females (52 to 104 cm) and males (45 to 55 cm) I am struggling on how to also include the location information in the code below.

length(seq(52,104,1)) # number sizes for females = 53
length(seq(45,55,1)) # number sizes for males = 11

## create sex, size, vectors
sex_vec <- factor(c(rep("F", 53), rep("M", 11)), levels = c("F","M"))
size_vec <- c(seq(52,104,1), seq(45,55,1))

## obtain 1000 simulated values from the posterior distribution for the size range in males and females
newdat <- data.frame(sex_vec,size_vec)
colnames(newdat) <- c("sex", "size")
Xmat <- model.matrix(~sex + size, data = newdat)
fitmat <- matrix(ncol = 1000, nrow = nrow(newdat))
for(i in 1:1000) fitmat[,i] <- Xmat %*% bsim_mod@fixef[i,]
newdat$lower <- apply(fitmat, 1, quantile, prob = 0.025)
newdat$postmean <- apply(fitmat, 1, mean)
newdat$upper <- apply(fitmat, 1, quantile, prob = 0.975)

## Now simulate for the mean size only
newdat_mean <- data.frame(sex = factor(c("F", "M"), levels = levels(data$sex)),
                               size = c(83, 50)
colnames(newdat_mean) <- c("sex", "size")
Xmat_mean <- model.matrix(~sex + size, data = newdat_mean)
fitmat_mean <- matrix(ncol = 1000, nrow = nrow(newdat_mean))

for(i in 1:1000) fitmat_mean[,i] <- Xmat_mean %*% bsim_mod@fixef[i,]
newdat_mean$lower <- apply(fitmat_mC_meanDW, 1, quantile, prob = 0.025)
newdat_mean$postmean <- apply(fitmat_mean, 1, mean)
newdat_mean$upper <- apply(fitmat_mean, 1, quantile, prob = 0.975)

The goal would be to simulate the values but also taking into account that the samples were taken in three different locations.


Solution

  • I would probably do it like this:

    library(lme4)
    library(arm)
    library(ggplot2)
    library(dplyr)
    
    mod <- lme4::lmer(output ~ sex + size + loc + (1|id), data=data)
    
    nsim <- 1000
    bsim_mod <- sim(mod, n.sim = nsim)                   
    

    To generate the data for the simulation you could group by sex and location and then create a sequence of sizes of the range of sizes in each location-sex combination. You can also save the mean size in each location for use later.

    newdat <- data %>% 
      group_by(sex, loc) %>% 
      reframe(mean_size = round(mean(size)), 
              size = seq(floor(min(size)), ceiling(max(size)), by=1))
    

    You can generate the predictions with a product of the model matrix and the simulated coefficients, we can do it without a loop.

    Xmat <- model.matrix(~sex + size + loc, data = newdat)
    fitmat <- Xmat %*% t(bsim_mod@fixef)
    

    You can summarize the posterior predictions and add them back into the newdat object.

    fitdat <- t(apply(fitmat, 1, \(x)c(mean(x), quantile(x, c(.025,.975)))))
    colnames(fitdat) <- c("postmean", "lower", "upper")
    newdat <- cbind(newdat, fitdat)
    

    If you need a data frame with just he mean sizes in each location-gender combination, you can now get it by filtering the object you just created, rather than redoing everything.

    newdat_mean <- newdat %>%
      group_by(sex, loc) %>% 
      filter(mean_size == size)
    

    Then, you could make the graph.

    ggplot(newdat, aes(x=size, y=postmean, colour=sex, fill=sex)) + 
      geom_ribbon(aes(ymin=lower, ymax=upper), colour="transparent", alpha=.15) + 
      geom_line() + 
      facet_wrap(~loc) + 
      theme_bw()
    

    Figure whth predicted values for sex and size by location

    data <- structure(list(id = c(989001026313185, 989001026313143, 989001026313517, 
    989001004605135, 989001026313424, 989001006700707, 989001005924838, 
    989001004605145, 989001006700794, 989001026313151, 989001006700633, 
    989001005924953, 989001005924862, 989001026313498, 989001026313255, 
    989001026313461, 989001026313160, 989001026313256, 989001026313322, 
    989001026313172, 989001026313388, 989001026313378, 989001026313357, 
    989001005924856, 989001026313343, 989001026313417, 989001026313369, 
    989001026313152, 985121031803859, 989001026313369, 989001006700560, 
    989001026313370, 989001006700537, 989001026313424, 989001026313463, 
    989001030278434), sex = c("F", "F", "F", "F", "F", "F", "F", 
    "F", "F", "F", "F", "F", "F", "F", "M", "M", "M", "M", "M", "F", 
    "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "M", "F", 
    "F", "F", "M"), size = c(52, 75, 71.5, 90, 80.6, 67.6, 86.5, 
    91.6, 103.1, 77, 70.8, 103.2, 88, 85.2, 50, 51.2, 45.4, 46, 50, 
    95.6, 85.8, 82, 87.4, 74.6, 74, 93.9, 91, 92.5, 88.3, 92, 92.6, 
    54.2, 60, 80.2, 94, 51.9), loc = c("loc1", "loc1", "loc1", "loc1", 
    "loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc1", 
    "loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc2", 
    "loc2", "loc2", "loc2", "loc2", "loc2", "loc2", "loc2", "loc2", 
    "loc2", "loc2", "loc2", "loc3", "loc3", "loc3", "loc3", "loc3"
    ), output = c(-12.2, -13.9, -14.1, -13.1, -11.7, -11.6, -11.6, 
    -12.6, -12.6, -13.3, -13.3, -13, -13.4, -13.9, -13.8, -11.9, 
    -13.5, -11.5, -10.7, -15.1, -11.7, -13.9, -12.9, -14.6, -14.3, 
    -14.9, -13.8, -13.7, -11.8, -14.1, -12.2, -12.5, -12.2, -11.6, 
    -11.9, -11)), class = "data.frame", row.names = c(NA, -36L))
    

    Created on 2025-03-25 with reprex v2.1.1.9000