Tags: r, ggplot2, plot, mutate, geom-point

Values not appearing in ggplot plot


When I made a plot using geom_point() and geom_line(), a few values in the middle didn't appear and I can't figure out why.

I have a dataframe with 3 columns (K_value, CV_error, and Run), where K_value is categorical.

dput() output of my data frame (df):

    structure(list(K_value = structure(c(10L, 11L, 12L, 13L, 14L, 
    1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 
    1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 
    1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 
    1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 
    1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 
    1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 
    1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 
    1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 
    1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 
    1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L), levels = c("1", "2", "3", 
    "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14"), class = "factor"), 
        CV_error = c(0.3496, 0.48953, 0.22838, 0.3241, 0.48187, 0.81215, 
        0.64932, 0.48208, 0.34502, 0.29175, 0.38106, 0.34349, 0.29372, 
        0.31848, 0.28904, 0.36266, 0.35706, 0.40682, 0.22942, 0.81252, 
        0.66357, 0.48312, 0.34643, 0.29845, 0.33101, 0.44156, 0.32816, 
        0.26834, 0.38874, 0.32601, 0.33054, 0.5124, 0.4978, 0.81195, 
        0.62714, 0.49569, 0.34549, 0.29434, 0.21027, 0.35551, 0.23482, 
        0.36595, 0.33906, 0.30915, 0.38615, 0.42463, 0.38548, 0.81222, 
        0.64116, 0.48115, 0.34543, 0.31653, 0.39421, 0.23617, 0.26476, 
        0.30773, 0.29044, 0.23667, 0.40504, 0.24453, 0.38279, 0.81107, 
        0.62831, 0.48073, 0.34307, 0.25076, 0.18189, 0.24538, 0.30349, 
        0.31099, 0.26404, 0.26664, 0.37712, 0.38249, 0.27946, 0.81362, 
        0.66236, 0.48343, 0.34475, 0.29682, 0.20412, 0.20799, 0.25753, 
        0.28842, 0.25157, 0.41521, 0.34065, 0.24796, 0.30641, 0.81291, 
        0.65986, 0.4821, 0.34447, 0.24829, 0.20115, 0.22076, 0.31345, 
        0.39544, 0.40846, 0.26986, 0.27907, 0.33826, 0.37872, 0.81762, 
        0.65032, 0.48309, 0.34895, 0.31037, 0.39639, 0.222, 0.33737, 
        0.23645, 0.35719, 0.42435, 0.2783, 0.41588, 0.43157, 0.81294, 
        0.6575, 0.47089, 0.34488, 0.24524, 0.29636, 0.22649, 0.23698, 
        0.30698, 0.40407, 0.3819, 0.31701, 0.47138, 0.34162, 0.81551, 
        0.66211, 0.49685, 0.34662, 0.23958, 0.32928, 0.19703, 0.25929, 
        0.29533), Run = c("1", "1", "1", "1", "1", "1", "1", "1", 
        "1", "1", "1", "1", "1", "1", "2", "2", "2", "2", "2", "2", 
        "2", "2", "2", "2", "2", "2", "2", "2", "3", "3", "3", "3", 
        "3", "3", "3", "3", "3", "3", "3", "3", "3", "3", "4", "4", 
        "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", "4", 
        "5", "5", "5", "5", "5", "5", "5", "5", "5", "5", "5", "5", 
        "5", "5", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6", 
        "6", "6", "6", "6", "7", "7", "7", "7", "7", "7", "7", "7", 
        "7", "7", "7", "7", "7", "7", "8", "8", "8", "8", "8", "8", 
        "8", "8", "8", "8", "8", "8", "8", "8", "9", "9", "9", "9", 
        "9", "9", "9", "9", "9", "9", "9", "9", "9", "9", "10", "10", 
        "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", 
        "10", "10")), class = "data.frame", row.names = c(NA, -140L
    ))

I wanted to plot the mean rate of change across K_value, like so:

# Plot the per-K_value mean of the row-to-row change in CV_error, with +/- 1 sd
# error bars.
df %>%
   # rate: CV_error minus the previous row's CV_error (lag() shifts by one row)
   mutate(rate = CV_error - lag(CV_error)) %>%
   # sd: standard deviation of rate within each K_value
   mutate(sd = sd(rate),
     .by=K_value) %>%
   # meanrate: mean of rate within each K_value
   mutate(meanrate = mean(rate),
     .by=K_value) %>%
 ggplot(aes(x=K_value, y=meanrate)) + 
 geom_point() + 
 geom_errorbar(aes(ymin=meanrate-sd, ymax=meanrate+sd), width=0.2) + 
 # group=1 puts all points in one group so a single line connects them
 geom_line(aes(group=1)) 

This yields:

enter image description here

Any idea why the data for K=10 is not appearing? The data for K=10 is there in the original df, and when I plot other things (e.g., the mean of CV_error) it shows up fine. I'm wondering if it has something to do with lag(). Am I using it incorrectly?


Solution

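  • After I run your mutate lines and look at the data, all of the K_value = 10 rows have NA for sd and meanrate. K_value is a factor, and although its levels are in the correct order, the rows of your data are sorted alphabetically by K_value rather than numerically, which means "10" is the first row within each Run. The very first row has no previous value, so lag() returns NA and its rate is NA; because mean() and sd() return NA whenever any input is NA, that single missing rate turns the whole K_value = 10 group into NA. (Note also that lag() is not grouped here, so the first row of each later Run is differenced against the last row of the previous Run; see row 15 below.)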

    # The question's mutate steps, run unchanged to inspect the intermediate data.
    df %>%
       # lag() has no .by here, so it runs over the whole data frame in row order:
       # only row 1 gets NA, and later Runs are differenced across the Run boundary.
       mutate(rate = CV_error - lag(CV_error)) %>%
       # sd() is called without na.rm = TRUE, so one NA rate makes the group's sd NA.
       mutate(sd = sd(rate),
         .by=K_value) %>%
       # Likewise mean() returns NA for any K_value group that contains an NA rate.
       mutate(meanrate = mean(rate),
         .by=K_value) 
    
        K_value CV_error Run     rate          sd  meanrate
    1        10  0.34960   1       NA          NA        NA
    2        11  0.48953   1  0.13993 0.095314894  0.013977
    3        12  0.22838   1 -0.26115 0.125867371 -0.018266
    4        13  0.32410   1  0.09572 0.107924558  0.046913
    5        14  0.48187   1  0.15777 0.110399314 -0.005331
    6         1  0.81215   1  0.33028 0.085966983  0.441737
    7         2  0.64932   1 -0.16283 0.013051558 -0.163086
    8         3  0.48208   1 -0.16724 0.016702958 -0.166252
    9         4  0.34502   1 -0.13706 0.007179984 -0.138402
    10        5  0.29175   1 -0.05327 0.029021407 -0.066298
    11        6  0.38106   1  0.08931 0.077438538  0.013361
    12        7  0.34349   1 -0.03757 0.111384103 -0.022936
    13        8  0.29372   1 -0.04977 0.082100970  0.013319
    14        9  0.31848   1  0.02476 0.066958840  0.026454
    15       10  0.28904   2 -0.02944          NA        NA
    16       11  0.36266   2  0.07362 0.095314894  0.013977
    ...
    

    If you sort your data correctly first, df |> arrange(Run, K_value) |> ... then you'll have K_value = 1 missing, not K_value = 10 (and correct values otherwise since the order is correct).

    # Mean rate of change in CV_error per K_value, with +/- 1 sd error bars.
    df |>
      # Sort so that within each Run the rows go K = 1, 2, ..., 14 (factor-level
      # order), making "previous row" mean "previous K".
      arrange(Run, K_value) |>
      # Difference against the previous K *within the same Run*. Without
      # .by = Run, K = 1 of each later Run would be compared with K = 14 of the
      # preceding Run. The first K of every Run has no predecessor, so it is NA.
      mutate(rate = CV_error - lag(CV_error), .by = Run) |>
      # sd() and mean() propagate NA, so K_value = 1 stays NA and is not drawn.
      mutate(
        sd = sd(rate),
        meanrate = mean(rate),
        .by = K_value
      ) |>
      ggplot(aes(x = K_value, y = meanrate)) +
      geom_point() +
      geom_errorbar(
        aes(ymin = meanrate - sd, ymax = meanrate + sd),
        width = 0.2
      ) +
      # K_value is a factor; group = 1 joins all points with a single line.
      geom_line(aes(group = 1))
    

    enter image description here

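    If you want values for K_value = 1, then you can set a default value in lag(), e.g. lag(CV_error, default = ...), to fill in whatever you want for the first row. Make sure the lag is computed per Run (e.g. mutate(rate = CV_error - lag(CV_error, default = ...), .by = Run)); otherwise the default only applies to the very first row of the whole data frame, and the K_value = 1 rows of the other Runs are differenced against the last row of the previous Run.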