Very odd plots from posterior simulation of factor by GAM model

I am running the model below (modGI_test):

  modGI_test <- gam(Met1 ~ Antibiotics + s(Timepoint, k=7) +
                s(Timepoint, by=Antibiotics, k=7, m=1) + 
                s(Timepoint, ID_new, bs="fs", k=7) +
                Age_use + Gender, 
                data=df_full, method="REML")

And getting the posterior simulation information from:

 beta <- coef(modGI_test)  
 Vb <- vcov(modGI_test) 
 Xp <-predict(modGI_test, type= "lpmatrix") 
 Beta_sim <-rmvn(n=1000, beta, Vb)

For metabolite (response), I would like to obtain separate plots for the global s(Timepoint), per individual s(Timepoint, ID_new, bs="fs") and also for the Antibiotics (Yes/No) s(Timepoint, by=Antibiotics). However, when I extract out the information for antibiotics yes and antibiotics no using the below and plot the mean and CI, the plots look very odd - where am I going wrong here please?

##antibiotics yes 

Xp_antibioticsyes <- Xp[, grep("s\\(Timepoint\\):Antibiotics(Yes)\\.\\d+", colnames(Xp))]
Beta_antibioticsyes <- Beta_sim[, grep("s\\(Timepoint\\):Antibiotics(Yes)\\.\\d+", colnames(Beta_sim))]
Eta_antibioticsyes <- Xp_antibioticsyes %*% t(Beta_antibioticsyes)
Mu_antibioticsyes <- inv_link(modGI_test)(Eta_antibioticsyes)

##antibiotics no 

Xp_antibioticsno <- Xp[, antibiotics_cols_no]
Beta_antibioticsno <- Beta_sim[,  grep("s\\(Timepoint\\):Antibiotics(No)\\.\\d+", colnames(Beta_sim))]
Eta_antibioticsno <- Xp_antibioticsno %*% t(Beta_antibioticsno)
Mu_antibioticsno <- inv_link(modGI_test)(Eta_antibioticsno)

global_intercept <- Xp[, "(Intercept)"] %*% t(Beta_sim[, "(Intercept)"])
main_effect <- Xp[, "AntibioticsYes"] %*% t(Beta_sim[, "AntibioticsYes"])

# Add these to the group-specific predictions

Mu_antibioticsyes <- Mu_antibioticsyes + global_intercept + main_effect
Mu_antibioticsno <- Mu_antibioticsno + global_intercept  # No main effect for "No"

##showing only antibiotics yes here
antibiotics_yes <- data.frame(t(apply(Mu_antibioticsyes, 1, function(x) {
  c(mean = mean(x, na.rm = TRUE),
  lowerci = quantile(x, probs = 0.025),
  upperci = quantile(x, probs = 0.975))
})))

antibiotics_yes$Timepoint <- stackoverflow$Timepoint
antibiotics_yes$Group <- rep("Yes", nrow(antibiotics_yes))

 ##doing the same for antibiotics no, rbinding them , then running ggplot with geom_ribbon

EDIT: dput(df_full)

structure(list(ID_new = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
  2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 
  4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 
  6L, 6L, 6L), levels = c("1", "3", "4", "5", "6", "7"), class = "factor"), 
  Age_use = c(61L, 61L, 61L, 61L, 61L, 61L, 62L, 51L, 51L, 
  51L, 51L, 51L, 51L, 52L, 30L, 30L, 30L, 30L, 30L, 30L, 31L, 
  50L, 50L, 50L, 50L, 50L, 50L, 51L, 43L, 43L, 43L, 43L, 43L, 
  43L, 44L, 30L, 30L, 30L, 30L, 30L, 30L, 31L), Gender = structure(c(2L, 
  2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 
  2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
  2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), levels = c("Female", 
  "Male"), class = "factor"), Antibiotics = structure(c(1L, 
  1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
  1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 
  2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), levels = c("No", 
  "Yes"), class = "factor"), Timepoint = c(1L, 2L, 3L, 4L, 
  5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 
  6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 
  7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L), Met1 = c(0.610312217, -0.892993391, 
  -0.193675438, 0.725636548, 1.028557245, 0.658419815, -0.608733906, 
  0.788168303, 0.031552182, -0.007267893, -0.612162917, -0.016844181, 
  0.773175345, -1.07871784, -0.255036005, 0.282531559, -0.218753849, 
 -0.823197508, 0.415051062, 1.248028647, 0.761003283, -0.413166119, 
  0.384806119, -0.00364492, 0.559821831, 0.091177022, -0.347510957, 
  0.509460406, -0.132033165, 0.479567493, -0.019536566, -0.190650928, 
  0.331425516, 0.691209456, 0.422804123, 0.024092717, -0.294300556, 
  -1.088126749, -1.879112138, 0.277489963, -0.73213862, 1.035703532
  )), row.names = c(NA, -42L), class = "data.frame")

Solution

For the data you added, I see:

So nothing much going on for Timepoint, whether globally of per Antibiotics.

I think the problems are coming from the fact that you are using the observed data and that might be causing problems. Instead, generate data evenly over the terms you want to plot. One further problem with the approach you took is that the intercept also include the reference level for Gender. Instead, I chose to generate posterior fitted samples (fitted values but showing the uncertainty in the model estimates), excluding the random smooths and the linear effect of Age_use, but including the average smooth of Timepoint. You can think of these results as showing the the estimated effect of Timepoint conditional upon Antibiotics and Gender, excluding the effects of Age_use and the subject_specific Timepoint effects.

For the posterior samples then I see:

library("gratia")
library("ggdist")
library("ggplot2")
library("dplyr")

## generate data to predict at
ds <- data_slice(
  m,
  Timepoint = evenly(Timepoint),
  Antibiotics = evenly(Antibiotics),
  Gender = evenly(Gender)
) |>
  mutate(
    .row = row_number()
  )

## compute the posterior samples
fv  <- fitted_samples(
  m,
  data = ds,
  n = 10000,
  exclude = c("Age_use", "s(Timepoint,ID_new)")
)

## compute the 95% interval
fv_int <- fv |>
  group_by(.row) |>
  median_qi(
    .width = c(0.95),
    .exclude = c(".draw", ".parameter", ".row")
  ) |>
  left_join(
    ds |>
      select(.row, Antibiotics, Timepoint, Gender),
    by = join_by(.row)
  )

## plot
fv_int |>
  ggplot(aes(y = .fitted, ymin = .lower, ymax = .upper, x = Timepoint)) +
  geom_ribbon(aes(fill = Antibiotics), alpha = 0.2) +
  geom_line(aes(colour = Antibiotics)) +
  facet_wrap( ~ Gender)

This is not unexpected given the estimated effects of the two difference smooths are effectively flat functions (no difference).

I feel this approach is better than just doing it for the difference smooths (i.e. also excluding the average effect of Timepoint) because your factor by smooths really are difference smooths and not the average effect of Timepoint in the two groups of Antibiotic.

More generally, your model is better estimated using the new "sz" basis, which wasn't in mgcv when we wrote the HGAM paper. This would give:

m <- gam(Met1 ~ s(Timepoint, k = 7) +
  s(Timepoint, Antibiotics, k = 7, bs = "sz") + 
  s(Timepoint, ID_new, bs = "fs", k = 7) +
  Age_use + Gender, 
  data = df_full, method = "REML")

These estimated effects are

Repeating the code from above

## generate data to predict at
ds <- data_slice(
  m,
  Timepoint = evenly(Timepoint),
  Antibiotics = evenly(Antibiotics),
  Gender = evenly(Gender)
) |>
  mutate(
    .row = row_number()
  )

## compute the posterior samples
fv  <- fitted_samples(
  m,
  data = ds,
  n = 10000,
  exclude = c("Age_use", "s(Timepoint,ID_new)")
)

## compute the 95% interval
fv_int <- fv |>
  group_by(.row) |>
  median_qi(
    .width = c(0.95),
    .exclude = c(".draw", ".parameter", ".row")
  ) |>
  left_join(
    ds |>
      select(.row, Antibiotics, Timepoint, Gender),
    by = join_by(.row)
  )

## plot
fv_int |>
  ggplot(aes(y = .fitted, ymin = .lower, ymax = .upper, x = Timepoint)) +
  geom_ribbon(aes(fill = Antibiotics), alpha = 0.2) +
  geom_line(aes(colour = Antibiotics)) +
  facet_wrap( ~ Gender)

We get

This model is preferred as it the difference smooths use a basis that is designed to be orthogonal to the basis for the main effect of Timepoint. These again are difference smooths, so again it makes sense to generate model predictions at the covariate values we want in order to plot the effects of Antibiotics.