When I made a plot using geom_point() and geom_line(), a few values in the middle didn't appear and I can't figure out why.
I have a dataframe with 3 columns (K_value, CV_error, and Run), where K_value is categorical.
dput() output:
structure(list(K_value = structure(c(10L, 11L, 12L, 13L, 14L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L), levels = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14"), class = "factor"),
CV_error = c(0.3496, 0.48953, 0.22838, 0.3241, 0.48187, 0.81215,
0.64932, 0.48208, 0.34502, 0.29175, 0.38106, 0.34349, 0.29372,
0.31848, 0.28904, 0.36266, 0.35706, 0.40682, 0.22942, 0.81252,
0.66357, 0.48312, 0.34643, 0.29845, 0.33101, 0.44156, 0.32816,
0.26834, 0.38874, 0.32601, 0.33054, 0.5124, 0.4978, 0.81195,
0.62714, 0.49569, 0.34549, 0.29434, 0.21027, 0.35551, 0.23482,
0.36595, 0.33906, 0.30915, 0.38615, 0.42463, 0.38548, 0.81222,
0.64116, 0.48115, 0.34543, 0.31653, 0.39421, 0.23617, 0.26476,
0.30773, 0.29044, 0.23667, 0.40504, 0.24453, 0.38279, 0.81107,
0.62831, 0.48073, 0.34307, 0.25076, 0.18189, 0.24538, 0.30349,
0.31099, 0.26404, 0.26664, 0.37712, 0.38249, 0.27946, 0.81362,
0.66236, 0.48343, 0.34475, 0.29682, 0.20412, 0.20799, 0.25753,
0.28842, 0.25157, 0.41521, 0.34065, 0.24796, 0.30641, 0.81291,
0.65986, 0.4821, 0.34447, 0.24829, 0.20115, 0.22076, 0.31345,
0.39544, 0.40846, 0.26986, 0.27907, 0.33826, 0.37872, 0.81762,
0.65032, 0.48309, 0.34895, 0.31037, 0.39639, 0.222, 0.33737,
0.23645, 0.35719, 0.42435, 0.2783, 0.41588, 0.43157, 0.81294,
0.6575, 0.47089, 0.34488, 0.24524, 0.29636, 0.22649, 0.23698,
0.30698, 0.40407, 0.3819, 0.31701, 0.47138, 0.34162, 0.81551,
0.66211, 0.49685, 0.34662, 0.23958, 0.32928, 0.19703, 0.25929,
0.29533), Run = c("1", "1", "1", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "2", "2", "2", "2", "2", "2",
"2", "2", "2", "2", "2", "2", "2", "2", "3", "3", "3", "3",
"3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "4", "4",
"4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4",
"5", "5", "5", "5", "5", "5", "5", "5", "5", "5", "5", "5",
"5", "5", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6",
"6", "6", "6", "6", "7", "7", "7", "7", "7", "7", "7", "7",
"7", "7", "7", "7", "7", "7", "8", "8", "8", "8", "8", "8",
"8", "8", "8", "8", "8", "8", "8", "8", "9", "9", "9", "9",
"9", "9", "9", "9", "9", "9", "9", "9", "9", "9", "10", "10",
"10", "10", "10", "10", "10", "10", "10", "10", "10", "10",
"10", "10")), class = "data.frame", row.names = c(NA, -140L
))
I wanted to plot the mean rate of change across K_value, like so:
# Mean change in CV_error per K_value, drawn with +/- 1 sd error bars.
df %>%
  # change in CV_error relative to the preceding row of df
  mutate(rate = CV_error - lag(CV_error)) %>%
  # spread and mean of that change over all rows sharing a K_value
  mutate(
    sd = sd(rate),
    meanrate = mean(rate),
    .by = K_value
  ) %>%
  ggplot(aes(x = K_value, y = meanrate)) +
  geom_point() +
  geom_errorbar(aes(ymin = meanrate - sd, ymax = meanrate + sd), width = 0.2) +
  geom_line(aes(group = 1))
This yields:
Any idea why the data for K=10 is not appearing? The data for K=10 is there in the original df, and when I plot other things (e.g., the mean of CV_error) it shows up fine. I'm wondering if it has something to do with lag(). Am I using it incorrectly?
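After I run your `mutate` lines and look at the data, all of the `K_value = 10` rows have `NA` values for `sd` and `meanrate`. Your data show that `K_value` is a factor, and although its levels are in the correct order, the rows themselves are not sorted numerically by `K_value` (they look string-sorted, with "10"–"14" coming before "1"–"9"), which means `"10"` is the first row within each `Run` group. When you `lag` the very first row there is no previous value, so you get a missing value; and because `sd()` and `mean()` are called without `na.rm = TRUE`, that single `NA` makes the result for the whole `K_value = 10` group `NA`.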
# Same transformation as in the question, stopping before the plot so the
# intermediate columns can be inspected.
df %>%
  mutate(rate = CV_error - lag(CV_error)) %>%
  mutate(
    sd = sd(rate),
    meanrate = mean(rate),
    .by = K_value
  )
K_value CV_error Run rate sd meanrate
1 10 0.34960 1 NA NA NA
2 11 0.48953 1 0.13993 0.095314894 0.013977
3 12 0.22838 1 -0.26115 0.125867371 -0.018266
4 13 0.32410 1 0.09572 0.107924558 0.046913
5 14 0.48187 1 0.15777 0.110399314 -0.005331
6 1 0.81215 1 0.33028 0.085966983 0.441737
7 2 0.64932 1 -0.16283 0.013051558 -0.163086
8 3 0.48208 1 -0.16724 0.016702958 -0.166252
9 4 0.34502 1 -0.13706 0.007179984 -0.138402
10 5 0.29175 1 -0.05327 0.029021407 -0.066298
11 6 0.38106 1 0.08931 0.077438538 0.013361
12 7 0.34349 1 -0.03757 0.111384103 -0.022936
13 8 0.29372 1 -0.04977 0.082100970 0.013319
14 9 0.31848 1 0.02476 0.066958840 0.026454
15 10 0.28904 2 -0.02944 NA NA
16 11 0.36266 2 0.07362 0.095314894 0.013977
...
If you sort your data correctly first (`df |> arrange(Run, K_value) |> ...`), then you'll have K_value = 1 missing, not K_value = 10 (and correct values otherwise, since the order is correct).
# Plot the mean rate of change of CV_error per K_value, with +/- 1 sd bars.
df %>%
  # Put rows in numeric K order within each run so lag() sees the previous K.
  arrange(Run, K_value) %>%
  # Difference from the previous K_value, computed separately for each Run so
  # that a run's first row is never compared with the last row of the run
  # before it; the first K_value of every run therefore has rate = NA.
  mutate(rate = CV_error - lag(CV_error), .by = Run) %>%
  # Per-K_value spread and mean of the rate across runs. NA is not removed, so
  # the first K_value stays NA and is left out of the plot.
  mutate(
    sd = sd(rate),
    meanrate = mean(rate),
    .by = K_value
  ) %>%
  ggplot(aes(x = K_value, y = meanrate)) +
  geom_point() +
  geom_errorbar(aes(ymin = meanrate - sd, ymax = meanrate + sd), width = 0.2) +
  # group = 1 joins the points into a single line even though x is a factor
  geom_line(aes(group = 1))
If you want values for `K_value = 1`, then you can set a `default` value in `lag` to fill in whatever you want for the first row within each group.