From my GAMM, I've plotted my parametric interaction term on the response scale, but I'm getting these jagged/spiked predictions in my plots, particularly in the green line [A.]. I'd like to see something more like the second plot [B], where each plot seems a lot smoother. Is this something related to my model, predict(), not enough data, or ggplot, and how can I fix it - if possible? My data is too large to share. Ignore the labels, I'm just copying from a tutorial. I have tried increasing the sample size in "MyData" to 10, and 100, but the problem remains.
A.
B.
library(plyr)
library(gratia)
library(ggplot2)
m4 <- bam(empty ~ SL*fZone +
s(fStation, bs = "re"),
data = c_neb4,
method = 'fREML',
discrete = TRUE,
family = binomial(link = "logit"),
select = TRUE,
gamma = 1.5)
MyData <- ddply(c_neb4,
.(SL, fZone, fStation),
summarize,
SL = evenly(SL, n = 1))
#' Specify a value for the random effect Plot.
MyData$fPlot <- "NA"
#' Do the actual prediction. Ignore the warning.
P2 <- predict(m4,
newdata = MyData,
se = TRUE,
newdata.guaranteed = TRUE,
exclude = "s(fPlot)" )
#' Convert the predicted values to the scale of the observed data.
MyData$eta <- P2$fit
MyData$Pi <- exp(MyData$eta) / (1 + exp(MyData$eta))
#' Calculate the SEs on the scale of the observed data.
MyData$se <- P2$se.fit
MyData$SeUp <- exp(MyData$eta + 1.96 *MyData$se) /
(1 + exp(MyData$eta + 1.96 *MyData$se))
MyData$SeLo <- exp(MyData$eta - 1.96 *MyData$se) /
(1 + exp(MyData$eta - 1.96 *MyData$se))
> head(MyData)
fZone fStation SL fPlot eta Pi se SeUp SeLo
1 West 146 9.28 NA -0.265285021 0.4340650 0.3192834 0.5891640 0.2908869
2 West 123 10.22 NA 0.014991421 0.5037478 0.3273342 0.6584887 0.3482856
3 Rankin 240 11.47 NA 0.079175289 0.5197835 0.4404169 0.7195826 0.3134495
4 West 70 13.26 NA 0.003325549 0.5008314 0.2679914 0.6291539 0.3723993
5 West 168 13.64 NA -0.308523994 0.4234751 0.2652243 0.5526332 0.3039912
6 West 70 14.43 NA -0.043266409 0.4891851 0.2606781 0.6148304 0.3648916
p <- ggplot()
p <- p + geom_point(data = c_neb4,
aes(y = empty, x = SL),
shape = 1,
size = 1)
p <- p + theme(text = element_text(size=15)) + theme_bw()
p <- p + geom_line(data = MyData,
aes(x = SL,
y = Pi,
group = fZone,
colour = fZone))
p <- p + geom_ribbon(data = MyData,
aes(x = SL,
ymax = SeUp,
ymin = SeLo,
group = fZone,
fill = fZone),
alpha = 0.5)
p + facet_wrap(. ~ fCYR , scales = "fixed", ncol = 2)
p
UPDATE: Random samples from data
> sample <- MyData[sample(nrow(MyData), 15), ]
> dput(sample)
structure(list(fZone = structure(c(3L, 3L, 3L, 3L, 2L, 2L, 4L,
3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L), levels = c("Crocodile", "Rankin",
"West", "Whipray"), class = "factor"), fStation = structure(c(34L,
3L, 41L, 3L, 69L, 51L, 67L, 10L, 11L, 43L, 42L, 41L, 69L, 17L,
7L), levels = c("20", "21", "22", "23", "24", "40", "54", "65",
"67", "68", "70", "71", "73", "101", "105", "106", "107", "111",
"112", "117", "118", "119", "122", "123", "124", "130", "133",
"134", "135", "137", "143", "144", "145", "146", "147", "156",
"157", "158", "159", "167", "168", "169", "171", "172", "173",
"174", "175", "176", "224", "225", "226", "227", "229", "237",
"239", "240", "241", "253", "254", "255", "256", "257", "265",
"266", "267", "268", "269", "270", "278", "279", "280", "281",
"282", "284", "290", "291", "292", "294", "301", "302", "303",
"304", "312", "313", "315", "323", "609", "619", "621", "622"
), class = "factor"), SL = c(21.78, 43.7, 22.93, 33.04, 25.27,
36.44, 41.77, 38.74, 28.67, 43.66, 19.52, 24.88, 34.64, 42.31,
25.4), fPlot = c("NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA",
"NA", "NA", "NA", "NA", "NA", "NA", "NA"), eta = c(-0.763062357276118,
-1.42419453038225, -0.678472110266059, -0.999690017882061, -0.603965123351189,
-1.36501523925482, -2.37609726046486, -1.08785823464558, -0.610334350940889,
-1.37234525043639, -0.0494515487646545, -0.756125374747801, -1.1302701466249,
-1.33176524564419, -0.553651972030815), Pi = c(0.317981764865072,
0.194004855626976, 0.336602397020586, 0.269002371918509, 0.353437060849354,
0.203426403466547, 0.085013654265328, 0.252021801284264, 0.351982931722769,
0.202241200193719, 0.487639631600599, 0.319488079759726, 0.244111249748232,
0.208867522794921, 0.365017539353501), se = c(0.25144350163443,
0.273182637481648, 0.221088894440468, 0.25265965865359, 0.32514106228583,
0.321868887860789, 0.843339582594233, 0.261905835743815, 0.202082465844402,
0.289981426724726, 0.291217371924577, 0.215329401915446, 0.301165703878997,
0.279681282946346, 0.282823671253511), SeUp = c(0.432849871674714,
0.291366496245083, 0.439020592793743, 0.376489115034319, 0.508327069687771,
0.324284213714876, 0.326699339739158, 0.360193869476184, 0.4466407823384,
0.309174449345777, 0.627459764109079, 0.417248290245548, 0.368190986274075,
0.313546689950327, 0.500170605899896), SeLo = c(0.221681937544196,
0.123506825014534, 0.24753428371226, 0.18318696178852, 0.224219940280471,
0.11963635802375, 0.0174803518597485, 0.167814848755473, 0.267681833971993,
0.125570002931353, 0.349727415459271, 0.235381214200561, 0.151799699462811,
0.132395701066921, 0.248246482427503)), row.names = c(374L, 2046L,
441L, 1348L, 634L, 1652L, 1949L, 1794L, 971L, 2040L, 198L, 587L,
1471L, 1983L, 653L), class = "data.frame")
> c_neb4_sample <- c_neb4[sample(nrow(c_neb4), 15), ]
> dput(c_neb4_sample)
structure(list(CYR_Keyfield = c("C-2009-7-11-68", "C-2016-7-17-147",
"C-2012-7-16-156", "C-2019-7-11-135", "C-2018-9-17-159", "C-2010-5-29-122",
"C-2010-5-29-22", "C-2010-5-29-22", "C-2010-9-5-157", "C-2017-8-13-168",
"C-2013-5-26-70", "C-2018-9-17-123", "C-2022-8-13-168", "C-2011-9-4-67",
"C-2017-10-8-123"), Species = c("Cynoscion nebulosus", "Cynoscion nebulosus",
"Cynoscion nebulosus", "Cynoscion nebulosus", "Cynoscion nebulosus",
"Cynoscion nebulosus", "Cynoscion nebulosus", "Cynoscion nebulosus",
"Cynoscion nebulosus", "Cynoscion nebulosus", "Cynoscion nebulosus",
"Cynoscion nebulosus", "Cynoscion nebulosus", "Cynoscion nebulosus",
"Cynoscion nebulosus"), ID = c("2009768_23", "20167147_35", "20127156_20",
"20197135_21", "20189159_24", "20105122_12", "2010522_15", "2010522_14",
"20109157_52", "20178168_134", "2013570_2", "20189123_20", "20228168_41",
"2011967_25", "201710123_122"), SL = c(44.05, 45.66, 45.57, 25.83,
42.54, 18.48, 26.8, 31.97, 48.41, 17.94, 23.17, 44.63, 28.25,
21.07, 25.1), empty = c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), DateTime = structure(c(1247343360, 1468778940, 1342456800,
1562867940, 1537209360, 1275138900, 1275143400, 1275143400, 1283705100,
1502655180, 1369589040, 1537212420, 1660402320, 1315149540, 1507481100
), class = c("POSIXct", "POSIXt"), tzone = ""), CYR = c(2009L,
2016L, 2012L, 2019L, 2018L, 2010L, 2010L, 2010L, 2010L, 2017L,
2013L, 2018L, 2022L, 2011L, 2017L), Month = c(7L, 7L, 7L, 7L,
9L, 5L, 5L, 5L, 9L, 8L, 5L, 9L, 8L, 9L, 10L), DoY = c(192, 199,
198, 192, 260, 149, 149, 149, 248, 225, 146, 260, 225, 247, 281
), ToD = c(16.2666666666667, 14.15, 12.6666666666667, 13.9833333333333,
14.6, 9.25, 10.5, 10.5, 12.75, 16.2166666666667, 13.4, 15.45,
10.8666666666667, 11.3166666666667, 12.75), JDay = c(1755, 4318,
2856, 5407, 5110, 2077, 2077, 2077, 2176, 4710, 3170, 5110, 6536,
2540, 4766), Station = c(68, 147, 156, 135, 159, 122, 22, 22,
157, 168, 70, 123, 168, 67, 123), Standard_collection_station = c(0,
0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1), Latitude = c(25.121,
25.046, 25.108, 25.036, 25.07, 25.025, 25.056, 25.056, 25.096,
25.094, 25.132, 25.012, 25.093999961391, 25.134, 25.012), Longitude = c(-80.955,
-80.91, -80.943, -80.924, -80.907, -80.938, -81.039, -81.039,
-80.931, -80.905, -80.941, -80.926, -80.9049999620765, -80.967,
-80.926), Zone = c("West", "West", "West", "West", "West", "West",
"West", "West", "West", "West", "West", "West", "West", "West",
"West"), sal = c(38.7999999999999, 40.74, 37.6, 35.89, 37.67,
38.1, 36.2, 36.2, 37.5, 43, 41.2, 36.59, 40.4, 37, 32.54), temp = c(30.9,
30.83, 29.4, 29.763, 32.7839999999998, 30, 30, 30, 31.5, 33.7999999999998,
27.7, 32.5739999999998, 30.046, 29.8, 29.66), fCYR = structure(c(1L,
8L, 4L, 11L, 10L, 2L, 2L, 2L, 2L, 9L, 5L, 10L, 14L, 3L, 9L), levels = c("2009",
"2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017",
"2018", "2019", "2020", "2021", "2022"), class = "factor"), fStation = structure(c(10L,
35L, 36L, 29L, 39L, 23L, 3L, 3L, 37L, 41L, 11L, 24L, 41L, 9L,
24L), levels = c("20", "21", "22", "23", "24", "40", "54", "65",
"67", "68", "70", "71", "73", "101", "105", "106", "107", "111",
"112", "117", "118", "119", "122", "123", "124", "130", "133",
"134", "135", "137", "143", "144", "145", "146", "147", "156",
"157", "158", "159", "167", "168", "169", "171", "172", "173",
"174", "175", "176", "224", "225", "226", "227", "229", "237",
"239", "240", "241", "253", "254", "255", "256", "257", "265",
"266", "267", "268", "269", "270", "278", "279", "280", "281",
"282", "284", "290", "291", "292", "294", "301", "302", "303",
"304", "312", "313", "315", "323", "609", "619", "621", "622"
), class = "factor"), fZone = structure(c(3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), levels = c("Crocodile",
"Rankin", "West", "Whipray"), class = "factor"), Season = c("Wet",
"Wet", "Wet", "Wet", "Wet", "Wet", "Wet", "Wet", "Wet", "Wet",
"Wet", "Wet", "Wet", "Wet", "Wet")), row.names = c(164L, 255L,
143L, 156L, 178L, 76L, 101L, 93L, 340L, 493L, 3L, 147L, 599L,
181L, 524L), class = "data.frame")
Maybe you shouldn't ignore the warning in the predict
step? You are excluding the wrong smooth there.
Of the terms in the model that are not shown in the plot you are having difficulty with its the s(fStation)
ranef term. I never really used plyr, but it seems you are generating all combinations of the fZone
, fStation
and observed values of SL
(plus an extra value of SL
because you also use evenly()
here?. This would be incorrect for this plot as even though you are removing the effect of the ranef when predicting (or trying to - note the code you show is wrong for this example) you will still have as many rows of data as you have combinations of fZone
, fStation
and plus the observed values of SL
.
Even if my specific diagnosis is incorrect, you clearly have too many data points than needed to generate the plot you want. All you need are n
evenly spaced values over SL
for each level of fZone
, which is 3 * n
.
What you should do is generate the crossing of the levels of fZone
with the 100 evenly spaced values of SL
, and after that, add on a column for fStation
that contains just one level of the random effect, but with the same levels as the original fStation
. Then predict for this much reduced data frame, while excluding the s(fStation)
smooth from the predicted values.
I find this most easy to do using functions from my {gratia} package:
ds <- data_slice(m4,
SL = evenly(SL, n = 100),
fZone = evenly(fZone), # or fZone = levels(fZone)
fStation = ref_level(fStation))
fv <- fitted_values(m4, data = ds, exclude = "s(fStation)",
scale = "response")
The first step creates the data slice for this plot. Note that evenly()
works for factors, returning the levels of the factor as a factor with the correct levels. ref_level()
returns the reference (first) level of the factor with levels equal to the levels of the original factor.
The second step there is doing all of your code with predict()
plus forming the confidence interval on the link/linear predictor scale from the standard errors, and then back transforming to the response scale, but in a single call.
The output from fitted_values()
is in a form amenable for plotting with ggplot()
.
Note that if you don't have some of the functions mentioned in the code, you might need to update {gratia}.