r ggplot2

Geom_line and geom_point not connecting for all paired samples


I previously asked a related question on Stack Overflow and followed its accepted answer (previous post here).

However, with a new dataset I am unable to reproduce the plot. Sometimes the line drawn by geom_line is 'short' and does not connect all the paired points. At other times the line passes through the points instead of starting/ending at their centres.

My code below:

# Data: reproducible example (dput output) of a 48-row tibble.
# Columns:
#   sample_id                     - "PB73-4" or "PB81"
#   Tissue                        - "SF" in every row
#   Group                         - factor with 7 levels; only "InEx" (4L) and
#                                   "axSpA SFMC" (3L) occur
#   clusters_names                - factor with 12 cluster levels
#   CD8TcellCountbytissue         - integer count, constant within each
#                                   sample_id / Group combination
#   patientcd8clustersizebytissue - integer count per cluster
#   freqcd8bytissue               - numeric; the values match
#                                   100 * patientcd8clustersizebytissue /
#                                   CD8TcellCountbytissue
#   Group2                        - character copy of the Group labels
df <- structure(list(sample_id = c("PB73-4", "PB73-4", "PB73-4", "PB73-4", 
"PB73-4", "PB73-4", "PB73-4", "PB73-4", "PB73-4", "PB73-4", "PB73-4", 
"PB73-4", "PB81", "PB81", "PB81", "PB81", "PB81", "PB81", "PB81", 
"PB81", "PB81", "PB81", "PB81", "PB81", "PB73-4", "PB73-4", "PB73-4", 
"PB73-4", "PB73-4", "PB73-4", "PB73-4", "PB73-4", "PB73-4", "PB73-4", 
"PB73-4", "PB73-4", "PB81", "PB81", "PB81", "PB81", "PB81", "PB81", 
"PB81", "PB81", "PB81", "PB81", "PB81", "PB81"), Tissue = c("SF", 
"SF", "SF", "SF", "SF", "SF", "SF", "SF", "SF", "SF", "SF", "SF", 
"SF", "SF", "SF", "SF", "SF", "SF", "SF", "SF", "SF", "SF", "SF", 
"SF", "SF", "SF", "SF", "SF", "SF", "SF", "SF", "SF", "SF", "SF", 
"SF", "SF", "SF", "SF", "SF", "SF", "SF", "SF", "SF", "SF", "SF", 
"SF", "SF", "SF"), Group = structure(c(4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), levels = c("HC PBMC", 
"axSpA PBMC", "axSpA SFMC", "InEx", "PD-1+ TIGIT+", "ReA PBMC", 
"ReA SFMC"), class = "factor"), clusters_names = structure(c(6L, 
1L, 10L, 5L, 7L, 3L, 11L, 4L, 2L, 12L, 8L, 9L, 3L, 1L, 8L, 5L, 
7L, 4L, 10L, 6L, 2L, 11L, 9L, 12L, 4L, 7L, 1L, 6L, 5L, 3L, 2L, 
12L, 8L, 10L, 9L, 11L, 2L, 11L, 1L, 3L, 8L, 6L, 5L, 10L, 7L, 
4L, 12L, 9L), levels = c("GZMK+CXCR3-high", "CD69+ITGAE+ TRM.1", 
"ISG-high", "CD69+ITGAE+ TRM.2", "GZMK+CXCR3-low", "CD69+ITGAE+ TRM.3", 
"GNLY+GZMB+", "Mito-high", "CD69+ITGAE- TRM", "GZMK+GZMB+", "Proliferating", 
"TCM-high"), class = "factor"), CD8TcellCountbytissue = c(2343L, 
2343L, 2343L, 2343L, 2343L, 2343L, 2343L, 2343L, 2343L, 2343L, 
2343L, 2343L, 3686L, 3686L, 3686L, 3686L, 3686L, 3686L, 3686L, 
3686L, 3686L, 3686L, 3686L, 3686L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2081L, 
2081L, 2081L, 2081L, 2081L, 2081L, 2081L, 2081L, 2081L, 2081L, 
2081L, 2081L), patientcd8clustersizebytissue = c(342L, 794L, 
159L, 300L, 104L, 374L, 23L, 95L, 62L, 9L, 42L, 39L, 1166L, 658L, 
200L, 631L, 106L, 129L, 142L, 506L, 75L, 39L, 29L, 5L, 127L, 
92L, 814L, 186L, 222L, 147L, 92L, 6L, 194L, 85L, 38L, 15L, 94L, 
12L, 393L, 718L, 101L, 158L, 356L, 49L, 47L, 129L, 6L, 18L), 
    freqcd8bytissue = c(14.5966709346991, 33.8881775501494, 6.78617157490397, 
    12.8040973111396, 4.43875373452838, 15.962441314554, 0.9816474605207, 
    4.0546308151942, 2.64618011096884, 0.384122919334187, 1.79257362355954, 
    1.66453265044814, 31.6332067281606, 17.8513293543136, 5.42593597395551, 
    17.1188279978296, 2.87574606619642, 3.4997287032013, 3.85241454150841, 
    13.7276180141074, 2.03472599023332, 1.05805751492132, 0.786760716223549, 
    0.135648399348888, 6.29335976214073, 4.5589692765114, 40.3369672943508, 
    9.21704658077304, 11.0009910802775, 7.28444003964321, 4.5589692765114, 
    0.297324083250743, 9.61347869177403, 4.21209117938553, 1.88305252725471, 
    0.743310208126858, 4.51705910619894, 0.576645843344546, 18.8851513695339, 
    34.5026429601153, 4.85343584814993, 7.59250360403652, 17.1071600192215, 
    2.3546371936569, 2.25852955309947, 6.19894281595387, 0.288322921672273, 
    0.864968765016819), Group2 = c("InEx", "InEx", "InEx", "InEx", 
    "InEx", "InEx", "InEx", "InEx", "InEx", "InEx", "InEx", "InEx", 
    "InEx", "InEx", "InEx", "InEx", "InEx", "InEx", "InEx", "InEx", 
    "InEx", "InEx", "InEx", "InEx", "axSpA SFMC", "axSpA SFMC", 
    "axSpA SFMC", "axSpA SFMC", "axSpA SFMC", "axSpA SFMC", "axSpA SFMC", 
    "axSpA SFMC", "axSpA SFMC", "axSpA SFMC", "axSpA SFMC", "axSpA SFMC", 
    "axSpA SFMC", "axSpA SFMC", "axSpA SFMC", "axSpA SFMC", "axSpA SFMC", 
    "axSpA SFMC", "axSpA SFMC", "axSpA SFMC", "axSpA SFMC", "axSpA SFMC", 
    "axSpA SFMC", "axSpA SFMC")), row.names = c(NA, -48L), class = c("tbl_df", 
"tbl", "data.frame"))



# Stats: within each cluster, run pairwise t-tests of freqcd8bytissue between
# the Group2 levels (no p-value adjustment), add significance symbols, and
# compute bracket positions for plotting along the Group2 axis.
# NOTE(review): `paired` is left at its default here - confirm that an
# unpaired test is intended for samples that are paired by sample_id.
stat.test_df <- df %>%
  group_by(clusters_names) %>%
  pairwise_t_test(freqcd8bytissue ~ Group2, p.adjust.method = "none") %>%
  add_significance() %>%
  add_xy_position(x = "Group2")

# Plot: cluster frequency per sample by group, one facet per cluster, with
# the p-value brackets from stat.test_df drawn on top.
# Points and lines each get their own position_jitter() call
# (width 0.1 for the points, width 0.2 for the lines; same seed).
df %>%
  arrange(clusters_names, sample_id) %>%
  ggplot(aes(x = Group2, y = freqcd8bytissue)) +
  geom_point(
    aes(color = Group2),
    size = 3,
    position = position_jitter(width = 0.1, height = 0, seed = 1)
  ) +
  geom_line(
    aes(group = sample_id),
    position = position_jitter(width = 0.2, height = 0, seed = 1)
  ) +
  ylab("Frequency of cluster in all CD8+CD45RO+ cells in axSpA, per patient") +
  stat_pvalue_manual(stat.test_df, label = "p.adj") +
  scale_color_manual(values = c("#f2a21e", "#88a0c4")) +
  facet_wrap(~ clusters_names) +
  theme_bw() +
  theme(legend.position = "none") +
  xlab(NULL)

The resulting plot below:

Plot


Solution

  • Use the same jitter object for both the points and the lines, so that both layers share one width and one seed:

    # One jitter specification (fixed width and seed) to be reused by every
    # layer that must line up.
    jitter <- position_jitter(width = 0.1, height = 0, seed = 1)
    

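    Then use geom_path() instead of geom_line(). geom_line() reorders the observations by x before the jitter is applied, so even with an identical seed its offsets no longer match those of the points; geom_path() keeps the row order of the data: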

    # Both layers take the same `jitter` position object.
    # The path layer is added first, so the points are drawn on top of the lines.
    geom_path(aes(group = sample_id), position = jitter) +
    geom_point(aes(color = Group2), position = jitter, size=3) +