r ggplot2 smoothing

problem with color ggplot lines and smoothness


I am having trouble creating a plot with several lines, each with a different color and shape, using the following dataset:

structure(list(age = structure(c(21, 45, 15, 16, 16, 16, 17, 
17, 17, 18, 18, 8, 9, 19, 19, 10, 10, 20, 21, 21, 21, 22, 22, 
22, 23, 24, 30, 42, 24, 24, 25, 25, 25, 26, 26, 26, 27, 27, 27, 
28, 28, 32, 29, 29, 29, 30, 30, 30, 31, 31, 31, 32, 32, 32, 33, 
33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 38, 38, 
38, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43, 43, 
44, 44, 45, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, 48, 48, 49, 
49, 49, 50, 50, 50, 51, 51, 51, 52, 52, 52, 53, 53, 53, 54, 54, 
54, 55, 52, 55, 56, 56, 56, 57, 57, 57, 58, 58, 58, 59, 59, 59, 
60, 60, 60, 61, 61, 61, 62, 62, 62, 63, 63, 63, 64, 64, 64, 65, 
65, 65, 61, 66, 66, 67, 67, 67, 68, 68, 68, 69, 69, 69, 70, 70, 
70, 71, 72, 71, 72, 72, 72, 73, 73, 73, 74, 74, 74, 75, 75, 75, 
76, 76, 74, 77, 77, 77, 78, 78, 78, 79, 79, 79, 80, 80, 80, 81, 
81, 61, 82, 82, 82, 83, 83, 83, 84, 84, 84, 85, 85, 85, 86, 86, 
86, 87, 87, 87, 88, 88, 88, 89, 89, 89, 90, 90, 90, 91, 91, 91, 
92, 92, 92, 93, 93, 93, 94, 94, 94, 95, 95, 96, 96, 97, 98, 98, 
99, 123), label = "age", format.stata = "%10.0g"), healthy = structure(c(0, 
1, NA, 0, 1, NA, 0, 0, 1, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, 
NA, 0, 1, NA, 0, 1, NA, 0, 0, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 
0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 
1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, 
NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 
0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 
1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, 
NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 
0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 
1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, 
NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 
0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 
1, NA, 0, 1, NA, 0, 1, NA, 0, 1, NA, 0, 1, 0, 1, 0, 0, 1, 0, 
0), label = "Has no health condition", format.stata = "%9.0g"), 
    col1 = c(NaN, NaN, NaN, NaN, NaN, 17.4959526062012, 
    NaN, NaN, 13.7867650985718, NaN, NaN, 17.1148929595947, NaN, 
    NaN, 21.7456340789795, NaN, NaN, 22.5867776870728, NaN, NaN, 
    19.4799966812134, NaN, NaN, 27.0873068896207, NaN, NaN, 25.2461756229401, 
    NaN, NaN, 24.2968890402052, NaN, NaN, 24.8148552349636, NaN, 
    NaN, 26.4674949645996, NaN, NaN, 28.1996012926102, NaN, NaN, 
    25.7581091680025, NaN, NaN, 28.0744308040988, NaN, NaN, 2.6226562442202, 
    NaN, NaN, 27.8483320304326, NaN, NaN, 31.5788489391929, NaN, 
    NaN, 28.7516339432959, NaN, NaN, 30.5037746810913, NaN, NaN, 
    29.9186396184175, NaN, NaN, 30.4269195417079, NaN, NaN, 7.7028050581614, 
    NaN, NaN, 29.524931703295, NaN, NaN, 29.4119287666522, NaN, 
    NaN, 27.8233588773813, NaN, NaN, 27.2971927142534, NaN, NaN, 
    29.1216611549503, NaN, NaN, 31.0940565321181, NaN, NaN, 8.9319949883681, 
    NaN, NaN, 29.896419574688, NaN, NaN, 28.0839774861055, NaN, 
    NaN, 28.9685502633816, NaN, NaN, 28.4690587390553, NaN, NaN, 
    28.9532592956056, NaN, NaN, 30.5288856335175, NaN, NaN, 29.1697682274712, 
    NaN, NaN, 30.3849462162365, NaN, NaN, 31.564100275437, NaN, 
    NaN, 31.2172098477681, NaN, NaN, 29.8669878641764, NaN, NaN, 
    29.011706361867, NaN, NaN, 34.0714236164952, NaN, NaN, 30.5197359085083, 
    NaN, NaN, 30.1599007987976, NaN, NaN, 29.1693642373179, NaN, 
    NaN, 30.4129105788011, 2.344, NaN, 31.452887409336, NaN, NaN, 
    33.9476460350884, NaN, NaN, 34.9800090471904, NaN, NaN, 31.3496222882657, 
    NaN, NaN, 33.3435718158506, NaN, NaN, 33.5722597038353, NaN, 
    NaN, 30.3194521300647, NaN, NaN, 34.8352882758431, NaN, NaN, 
    32.2088109652201, NaN, NaN, 36.3922643159565, NaN, NaN, 33.6284904612435, 
    NaN, NaN, 36.0349838032442, NaN, NaN, 37.3032214742311, NaN, 
    NaN, 35.8305793603261, NaN, NaN, 33.5542264086135, NaN, NaN, 
    37.3956505526667, NaN, NaN, 35.6306561260689, NaN, NaN, 34.1713064738682, 
    NaN, NaN, 34.4658223736671, NaN, NaN, 33.7924506975257, NaN, 
    NaN, 35.8171516060829, NaN, NaN, 32.4849494457245, NaN, NaN, 
    38.3814140768612, NaN, NaN, 34.3103677204677, NaN, NaN, 28.7768711447716, 
    NaN, NaN, 30.117017004225, NaN, NaN, 34.4370415551322, NaN, 
    NaN, 37.1509838104248, NaN, NaN, 39.7904551029205, NaN, NaN, 
    45.0848134358724, NaN, NaN, 38.9700946807861, NaN, NaN, 44.7834892272949, 
    NaN, NaN, 18.0415954589844, NaN, NaN, NaN, NaN, NaN, NaN, 
    NaN, NaN, NaN), col2 = c(9.73884888912769, 3.55150119134575, 
    NaN, 9.69603123615697, 3.53710220541273, NaN, 9.66741126191382, 
    3.51121828223117, NaN, 9.63565664291382, 3.48193562582294, 
    NaN, 9.45292086436831, 3.47054993978111, NaN, 9.59095467716815, 
    3.4745261718521, NaN, 9.39332915389019, 3.4410732194279, 
    NaN, 9.45476519597041, 3.43423518675641, NaN, 9.36457543146043, 
    3.41311541986672, NaN, 9.36624081511247, 3.4122371215694, 
    NaN, 9.40900721373381, 3.40614468643151, NaN, 9.38348010778427, 
    3.38705124429694, NaN, 9.34997592101226, 3.38823599217939, 
    NaN, 9.35146037391994, 3.36121070384979, NaN, 9.305238659099, 
    3.36387685238654, NaN, 9.24218050638835, 3.36620352413733, 
    NaN, 9.28473771413167, 3.34838822679441, NaN, 9.12819498328752, 
    3.3376117037953, NaN, 9.16214960674907, 3.34076386559792, 
    NaN, 9.20669736862183, 3.33245655816638, NaN, 9.16058754702227, 
    3.34403007896617, NaN, 9.19735798665455, 3.31413558165537, 
    NaN, 9.09941055815099, 3.30938078784778, NaN, 9.17626369700712, 
    3.30820650180788, NaN, 9.10454079223006, 3.32709397527528, 
    NaN, 9.18342437010545, 3.31390441284663, NaN, 9.15841562379666, 
    3.31152755907661, NaN, 9.06430034915896, 3.31614059893811, 
    NaN, 9.15609089195306, 3.30047230137553, NaN, 9.14333961931474, 
    3.31495148878472, NaN, 9.18908043650838, 3.31331432326157, 
    NaN, 9.13065004079355, 3.33573157658996, NaN, 9.09754027804813, 
    3.33558044968891, NaN, 9.20422429604964, 3.33170155550341, 
    NaN, 9.24062031539029, 3.34154178537922, NaN, 9.24709942419204, 
    3.35687677938486, NaN, 9.31868435786321, 3.36500219054192, 
    NaN, 9.26267687479655, 3.38257493265543, NaN, 9.34868538379669, 
    3.39329878659382, NaN, 9.39233847531405, 3.39171330950282, 
    NaN, 9.39422841092725, 3.42206691953289, NaN, 9.44792991372483, 
    3.41481578063612, NaN, 9.51408845773015, 3.44435220991933, 
    NaN, 9.52255477905273, 3.46103483781632, NaN, 9.54275298855968, 
    3.47304158897723, NaN, 9.58754419846968, 3.50915765865764, 
    NaN, 9.74308764395402, 3.50377081416107, NaN, 9.72430517088692, 
    3.53841964457851, NaN, 9.59207115447122, 3.55271640356007, 
    NaN, 9.72684358766941, 3.56763132344122, NaN, 9.87622256372489, 
    3.57748130932605, NaN, 9.86901096675707, 3.59796099073431, 
    NaN, 9.89924946358676, 3.63481671815923, NaN, 10.0701293576743, 
    3.63426666021913, NaN, 10.0823068306825, 3.65953091444696, 
    NaN, 10.13834406363, 3.70572604793357, NaN, 10.2659117362403, 
    3.74113381209494, NaN, 10.3758992457019, 3.75834940744368, 
    NaN, 10.4749858366045, 3.80154993331019, NaN, 10.4794574134094, 
    3.82382008481566, NaN, 10.5577326643056, 3.86617674629581, 
    NaN, 10.6963342138603, 3.88153685222973, NaN, 10.8680706911309, 
    3.91069514101202, NaN, 10.8778532635082, 3.96953968384966, 
    NaN, 10.9667280233359, 4.00885742770301, NaN, 11.0575908298141, 
    3.99355643777286, NaN, 11.2109310068983, 4.09530009965639, 
    NaN, 11.2052431462416, 4.10855156691499, NaN, 11.4367402182685, 
    4.14386796464725, NaN, 11.4933553970966, 4.2487904576972, 
    NaN, 11.7320929695578, 4.31915928588973, NaN, 11.8614741487706, 
    4.30428578172411, NaN, 12.023966105779, 4.36698396164074, 
    NaN, 12.2372739829269, 4.42745336266451, NaN, 12.2063292094639, 
    4.49448736011982, NaN, 12.3472878138224, 4.46298309734889, 
    NaN, 12.6991779892533, 4.59589936998155, NaN, 12.7648248076439, 
    4.691696030753, NaN, 12.9378400802612, 4.79949178695679, 
    NaN, 13.0320794582367, 4.80274267196655, NaN, 13.6076149940491, 
    4.75885391235352, 13.6394104003906, 5.07671976089478, 13.3812193870544, 
    14.5171988010406, 4.9033317565918, 13.9298572540283, 15.8893375396729
    )), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -249L), groups = structure(list(age = structure(c(15, 
14, 16, 22, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 
96, 97, 98, 99, 103), label = "age", format.stata = "%10.0g"), 
    .rows = structure(list(1:3, 4:6, 7:9, 10:12, 13:15, 16:18, 
        19:21, 22:24, 25:27, 28:30, 31:33, 34:36, 37:39, 40:42, 
        43:45, 46:48, 49:51, 52:54, 55:57, 58:60, 61:63, 64:66, 
        67:69, 70:72, 73:75, 76:78, 79:81, 82:84, 85:87, 88:90, 
        91:93, 94:96, 97:99, 100:102, 103:105, 106:108, 109:111, 
        112:114, 115:117, 118:120, 121:123, 124:126, 127:129, 
        130:132, 133:135, 136:138, 139:141, 142:144, 145:147, 
        148:150, 151:153, 154:156, 157:159, 160:162, 163:165, 
        166:168, 169:171, 172:174, 175:177, 178:180, 181:183, 
        184:186, 187:189, 190:192, 193:195, 196:198, 199:201, 
        202:204, 205:207, 208:210, 211:213, 214:216, 217:219, 
        220:222, 223:225, 226:228, 229:231, 232:234, 235:237, 
        238:240, 241:242, 243:244, 245L, 246:247, 248L, 249L), ptype = integer(0), class = 
   c("vctrs_list_of", 
    "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -86L), .drop = TRUE))

I am trying to create a plot like this:

enter image description here

I wrote the following code:

 data3|> ggplot(aes(x=age,y=col1,group=factor(healthy)))+geom_smooth()+
 geom_point(aes(x=age,y=col21))+scale_fill_lancet()+
  scale_fill_discrete(labels=c("Healthy","Sick","SCI"))

The code generated the following plot: enter image description here

But I cannot assign different shapes or colors to the generated lines. How can I solve this?


Solution

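  • There are a few issues in your code. At a surface level, you should be using scale_color_discrete, not scale_fill_discrete. (Note: the code below uses the column names swiscihat1 and shshat1, which appear to correspond to col1 and col2, respectively, in the data as posted.)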

    data3|> 
      ggplot(aes(x=age,y=swiscihat1,color=factor(healthy)))+
      geom_smooth()+
      geom_point(aes(x=age,y=shshat1))+
      scale_color_discrete(labels=c("Healthy","Sick","SCI"))
    

    enter image description here

    Unfortunately, you have a larger problem. The color for geom_smooth is not linked to a value in "healthy", which means it is very difficult to make it appear in a unified legend with the other two lines. However, it is possible to trick ggplot into doing this:

    data3 |> 
      ggplot() + 
      geom_smooth(aes(x = age, y = swiscihat1, color = 'SCI', lty = 'SCI'), se = F, show.legend = F) +
      geom_line(aes(x = age, y = shshat1, color = factor(healthy), lty = factor(healthy))) + 
      scale_color_manual(values = c('#82c1e9', '#2e6d65', '#3e647d'), labels=c("Healthy","Sick", 'SCI'), na.translate = F) +
      scale_linetype_manual(values = c(1, 2, 1), labels=c("Healthy","Sick", 'SCI'), na.translate = F, guide = guide_none()) +
      theme_bw() +
      labs(x = 'Age', y = 'Visits to Healthcare Provider', color = NULL)
    

    enter image description here

    But that is a strange way to use ggplot, and somewhat difficult to follow. You have to pass the literal string "SCI" as an aesthetic value in geom_smooth, repeat aesthetic definitions elsewhere, and suppress a lot of ggplot's default (intended) behavior.

    Instead, I would precompute what I intend to plot.

    First, compute the smooth:

    data3.loess <- loess(swiscihat1 ~ age, data = data3)
    

    Then we'll create data4, which aligns all the values for the plot in one dataset:

    data4 <- data3 |> 
      ungroup() |> 
      filter(!is.na(shshat1)) |> 
      mutate(smooth = predict(data3.loess, newdata = data.frame(age, swiscihat1))) |> 
      select(age, healthy, shshat1, smooth) |> 
      pivot_wider(id_cols = c(age, smooth), names_from = healthy, values_from = shshat1) |> 
      pivot_longer(-age) |> 
      mutate(label = case_when(
        name == 'smooth' ~ 'SCI',
        name == '0' ~ 'Sick',
        name == '1' ~ 'Healthy'
      ), label = factor(label, c('SCI', 'Sick', 'Healthy'), ordered = T))
    
         age name   value label  
       <dbl> <chr>  <dbl> <ord>  
     1    15 smooth NA    SCI    
     2    15 0       9.74 Sick   
     3    15 1       3.55 Healthy
     4    16 smooth 18.5  SCI    
     5    16 0       9.70 Sick   
     6    16 1       3.54 Healthy
     7    17 smooth 19.3  SCI    
     8    17 0       9.67 Sick   
     9    17 1       3.51 Healthy
    10    18 smooth 20.0  SCI    
    # … with 248 more rows
    

    And finally, this greatly simplifies the call to ggplot:

    data4 |> 
      ggplot(aes(x = age, y = value, color = label, lty = label)) +
      geom_line() +
      scale_color_manual(values = c( '#3e647d', '#82c1e9', '#2e6d65')) +
      scale_linetype_manual(values = c(1, 1, 2)) +
      theme_bw() +
      labs(x = 'Age', y = 'Visits to Healthcare Provider', color = NULL, linetype = NULL)
    

    enter image description here