I am trying to simulate data from a liner mixed model in R using the sim()
function from the arm
package.
Here is a subset of my data.
> dput(data)
structure(list(id = c(989001026313185, 989001026313143, 989001026313517,
989001004605135, 989001026313424, 989001006700707, 989001005924838,
989001004605145, 989001006700794, 989001026313151, 989001006700633,
989001005924953, 989001005924862, 989001026313498, 989001026313255,
989001026313461, 989001026313160, 989001026313256, 989001026313322,
989001026313172, 989001026313388, 989001026313378, 989001026313357,
989001005924856, 989001026313343, 989001026313417, 989001026313369,
989001026313152, 985121031803859, 989001026313369, 989001006700560,
989001026313370, 989001006700537, 989001026313424, 989001026313463,
989001030278434), sex = c("F", "F", "F", "F", "F", "F", "F",
"F", "F", "F", "F", "F", "F", "F", "M", "M", "M", "M", "M", "F",
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "M", "F",
"F", "F", "M"), size = c(52, 75, 71.5, 90, 80.6, 67.6, 86.5,
91.6, 103.1, 77, 70.8, 103.2, 88, 85.2, 50, 51.2, 45.4, 46, 50,
95.6, 85.8, 82, 87.4, 74.6, 74, 93.9, 91, 92.5, 88.3, 92, 92.6,
54.2, 60, 80.2, 94, 51.9), loc = c("loc1", "loc1", "loc1", "loc1",
"loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc1",
"loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc2",
"loc2", "loc2", "loc2", "loc2", "loc2", "loc2", "loc2", "loc2",
"loc2", "loc2", "loc2", "loc3", "loc3", "loc3", "loc3", "loc3"
), output = c(-12.2, -13.9, -14.1, -13.1, -11.7, -11.6, -11.6,
-12.6, -12.6, -13.3, -13.3, -13, -13.4, -13.9, -13.8, -11.9,
-13.5, -11.5, -10.7, -15.1, -11.7, -13.9, -12.9, -14.6, -14.3,
-14.9, -13.8, -13.7, -11.8, -14.1, -12.2, -12.5, -12.2, -11.6,
-11.9, -11)), class = "data.frame", row.names = c(NA, -36L))
I ran a LMM to understand the influence of location (factor, 3 levels: loc1, loc2, loc3), sex (factor, 2 levels: M and F) and size (continuous in cm) on the output:
mod <- lme4::lmer(output ~ sex + size + location + (1|id)
I then drew 1000 random values from the joint posterior ditribution of the model parameters.
nsim <- 1000
bsim_mod <- sim(mod, n.sim = nsim)
I would like to plot the results now by visualising the credible interval of the bsim_mod
object and the observations.
While I can do that for the different size range in females (52 to 104 cm) and males (45 to 55 cm) I am struggling on how to also include the location information in the code below.
length(seq(52,104,1)) # number sizes for females = 53
length(seq(45,55,1)) # number sizes for males = 11
## create sex, size, vectors
sex_vec <- factor(c(rep("F", 53), rep("M", 11)), levels = c("F","M"))
size_vec <- c(seq(52,104,1), seq(45,55,1))
## obtain 1000 simulated values from the posterior distribution for the size range in males and females
newdat <- data.frame(sex_vec,size_vec)
colnames(newdat) <- c("sex", "size")
Xmat <- model.matrix(~sex + size, data = newdat)
fitmat <- matrix(ncol = 1000, nrow = nrow(newdat))
for(i in 1:1000) fitmat[,i] <- Xmat %*% bsim_mod@fixef[i,]
newdat$lower <- apply(fitmat, 1, quantile, prob = 0.025)
newdat$postmean <- apply(fitmat, 1, mean)
newdat$upper <- apply(fitmat, 1, quantile, prob = 0.975)
## Now simulate for the mean size only
newdat_mean <- data.frame(sex = factor(c("F", "M"), levels = levels(data$sex)),
size = c(83, 50)
colnames(newdat_mean) <- c("sex", "size")
Xmat_mean <- model.matrix(~sex + size, data = newdat_mean)
fitmat_mean <- matrix(ncol = 1000, nrow = nrow(newdat_mean))
for(i in 1:1000) fitmat_mean[,i] <- Xmat_mean %*% bsim_mod@fixef[i,]
newdat_mean$lower <- apply(fitmat_mC_meanDW, 1, quantile, prob = 0.025)
newdat_mean$postmean <- apply(fitmat_mean, 1, mean)
newdat_mean$upper <- apply(fitmat_mean, 1, quantile, prob = 0.975)
The goal would be to simulate the values but also taking into account that the samples were taken in three different locations.
I would probably do it like this:
library(lme4)
library(arm)
library(ggplot2)
library(dplyr)
mod <- lme4::lmer(output ~ sex + size + loc + (1|id), data=data)
nsim <- 1000
bsim_mod <- sim(mod, n.sim = nsim)
To generate the data for the simulation you could group by sex and location and then create a sequence of sizes of the range of sizes in each location-sex combination. You can also save the mean size in each location for use later.
newdat <- data %>%
group_by(sex, loc) %>%
reframe(mean_size = round(mean(size)),
size = seq(floor(min(size)), ceiling(max(size)), by=1))
You can generate the predictions with a product of the model matrix and the simulated coefficients, we can do it without a loop.
Xmat <- model.matrix(~sex + size + loc, data = newdat)
fitmat <- Xmat %*% t(bsim_mod@fixef)
You can summarize the posterior predictions and add them back into the newdat
object.
fitdat <- t(apply(fitmat, 1, \(x)c(mean(x), quantile(x, c(.025,.975)))))
colnames(fitdat) <- c("postmean", "lower", "upper")
newdat <- cbind(newdat, fitdat)
If you need a data frame with just he mean sizes in each location-gender combination, you can now get it by filtering the object you just created, rather than redoing everything.
newdat_mean <- newdat %>%
group_by(sex, loc) %>%
filter(mean_size == size)
Then, you could make the graph.
ggplot(newdat, aes(x=size, y=postmean, colour=sex, fill=sex)) +
geom_ribbon(aes(ymin=lower, ymax=upper), colour="transparent", alpha=.15) +
geom_line() +
facet_wrap(~loc) +
theme_bw()
data <- structure(list(id = c(989001026313185, 989001026313143, 989001026313517,
989001004605135, 989001026313424, 989001006700707, 989001005924838,
989001004605145, 989001006700794, 989001026313151, 989001006700633,
989001005924953, 989001005924862, 989001026313498, 989001026313255,
989001026313461, 989001026313160, 989001026313256, 989001026313322,
989001026313172, 989001026313388, 989001026313378, 989001026313357,
989001005924856, 989001026313343, 989001026313417, 989001026313369,
989001026313152, 985121031803859, 989001026313369, 989001006700560,
989001026313370, 989001006700537, 989001026313424, 989001026313463,
989001030278434), sex = c("F", "F", "F", "F", "F", "F", "F",
"F", "F", "F", "F", "F", "F", "F", "M", "M", "M", "M", "M", "F",
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "M", "F",
"F", "F", "M"), size = c(52, 75, 71.5, 90, 80.6, 67.6, 86.5,
91.6, 103.1, 77, 70.8, 103.2, 88, 85.2, 50, 51.2, 45.4, 46, 50,
95.6, 85.8, 82, 87.4, 74.6, 74, 93.9, 91, 92.5, 88.3, 92, 92.6,
54.2, 60, 80.2, 94, 51.9), loc = c("loc1", "loc1", "loc1", "loc1",
"loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc1",
"loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc1", "loc2",
"loc2", "loc2", "loc2", "loc2", "loc2", "loc2", "loc2", "loc2",
"loc2", "loc2", "loc2", "loc3", "loc3", "loc3", "loc3", "loc3"
), output = c(-12.2, -13.9, -14.1, -13.1, -11.7, -11.6, -11.6,
-12.6, -12.6, -13.3, -13.3, -13, -13.4, -13.9, -13.8, -11.9,
-13.5, -11.5, -10.7, -15.1, -11.7, -13.9, -12.9, -14.6, -14.3,
-14.9, -13.8, -13.7, -11.8, -14.1, -12.2, -12.5, -12.2, -11.6,
-11.9, -11)), class = "data.frame", row.names = c(NA, -36L))
Created on 2025-03-25 with reprex v2.1.1.9000