r ggplot2 plot bioinformatics

Making lines not overlap


I am creating a DGE plot for 2 samples. I want the lines in the resulting plot NOT to be stacked on top of each other, but to be placed next to each other (so that both are visible). What I want / What I have

## Load packages
# Install each package only when it is missing, then attach it.
# requireNamespace() checks availability without attaching anything, and
# library() stops with an error if the package still cannot be loaded
# (require() would only return FALSE and let the script carry on).
for (pkg in c("ggplot2", "dplyr", "tidyr")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

# Gene symbols grouped by name prefix, flattened into one character vector
# and then reversed, so "ADGRV1" ends up first and "ADGRA1" last.
gene_order <- rev(unlist(
  list(
    ADGRA = c("ADGRA1", "ADGRA2", "ADGRA3"),
    ADGRB = c("ADGRB1", "ADGRB2", "ADGRB3"),
    CELSR = c("CELSR1", "CELSR2", "CELSR3"),
    ADGRD = c("ADGRD1", "ADGRD2"),
    ADGRE = c("ADGRE1", "ADGRE2", "ADGRE3", "ADGRE5"),
    ADGRF = c("ADGRF1", "ADGRF2", "ADGRF3", "ADGRF4", "ADGRF5"),
    ADGRG = c(
      "ADGRG1", "ADGRG2", "ADGRG3", "ADGRG4", "ADGRG5", "ADGRG6", "ADGRG7"
    ),
    ADGRL = c("ADGRL1", "ADGRL2", "ADGRL3", "ADGRL4"),
    ADGRV = "ADGRV1"
  ),
  use.names = FALSE
))

## Change variables
# Labels used for the two samples in the plot legend
sample_1 <- "DC"
sample_2 <- "CDC2"

# Directory holding the input table; the plot is written to the same place
file_path <- "/home/pospim/Desktop/Work/bioinformatics/datasets/GSE189727_HIV_done/GSE189727_HIV_dataset1_SZ_DC_Axl_together - analyzed/DGE_radar"
file_name <- file.path(file_path, "GPCRS-DE_analysis_DC_CDC2.tsv")
plot_name <- file.path(file_path, "DGE_plot.png")

## Read the DGE table: tab-separated, first row holds the column names
data <- read.table(
  file_name,
  header = TRUE, # spelled out; `T` is an ordinary variable and can be reassigned
  sep = "\t",
  stringsAsFactors = FALSE
)
head(data)

# Swap decimal commas for decimal points in every column. gsub() returns
# character vectors, so all columns are character after this step.
data[] <- lapply(data, function(x) gsub(",", ".", x))
head(data)

## Clean data
# Convert the four columns used downstream back to numeric. The columns are
# already character here, so as.numeric() alone is enough; entries that cannot
# be parsed as numbers become NA.
numeric_cols <- c(
  "log2FoldChange_1", "pvalue_1",
  "log2FoldChange_2", "pvalue_2"
)
data[numeric_cols] <- lapply(data[numeric_cols], as.numeric)

## Handle NAs
# Missing fold changes become 0 and missing p-values become 1, so the
# -log10(p) computed below is 0 for them.
data$log2FoldChange_1[is.na(data$log2FoldChange_1)] <- 0
data$pvalue_1[is.na(data$pvalue_1)] <- 1
data$log2FoldChange_2[is.na(data$log2FoldChange_2)] <- 0
data$pvalue_2[is.na(data$pvalue_2)] <- 1

## Add log10 pvalue
data$log10_pvalue_1 <- -log10(data$pvalue_1)
data$log10_pvalue_2 <- -log10(data$pvalue_2)

# Fix the gene order through the factor levels; symbols that are not listed
# in gene_order become NA.
data$gene_symbol <- factor(data$gene_symbol, levels = gene_order)
head(data)

## Convert data to long format for plotting
# A single pivot splits each column name into its measure ("log2FoldChange" or
# "log10_pvalue") and its numeric suffix. The ".value" sentinel keeps the two
# measures as separate columns, so every gene yields exactly one row per
# sample and no cross product has to be filtered afterwards.
# The suffix is matched with an anchored pattern rather than grepl("2", ...),
# because a bare "2" also matches the "2" inside "log2FoldChange".
data_lng <- data %>%
  pivot_longer(
    cols = matches("^(log2FoldChange|log10_pvalue)_[12]$"),
    names_to = c(".value", "sample"),
    names_pattern = "^(log2FoldChange|log10_pvalue)_([12])$"
  ) %>%
  # Replace the numeric suffix with the sample label
  mutate(sample = case_when(
    sample == "1" ~ sample_1,
    sample == "2" ~ sample_2
  )) %>%
  select(gene_symbol, sample, log2FoldChange, log10_pvalue)
head(data_lng)

# (Re)apply the gene order through the factor levels so it drives the y axis
data_lng$gene_symbol <- factor(data_lng$gene_symbol, levels = gene_order)

## Plot DGE
# Colour for each sample, keyed by the sample labels
sample_colours <- c("blue", "cadetblue1")
names(sample_colours) <- c(sample_1, sample_2)

# One horizontal segment per gene and sample, running from the fold change
# to x = 0. Both samples use the same y position (gene_symbol) and no
# position adjustment is applied, so their segments are drawn on top of
# each other.
p <- ggplot(
  data = data_lng,
  mapping = aes(x = log2FoldChange, y = gene_symbol, color = sample)
) +
  geom_segment(
    mapping = aes(xend = 0, yend = gene_symbol),
    linewidth = 3,
    show.legend = TRUE
  ) +
  scale_color_manual(name = "Sample", values = sample_colours) +
  labs(
    title = "Differential Gene Expression",
    x = "relative expression (log2FoldChange)",
    y = ""
  ) +
  theme_minimal()

# Write the plot to disk, then show it on the active graphics device
ggsave(plot_name, plot = p)
print(p)
 
structure(list(gene_symbol = structure(32:1, levels = c("ADGRV1", 
"ADGRL4", "ADGRL3", "ADGRL2", "ADGRL1", "ADGRG7", "ADGRG6", "ADGRG5", 
"ADGRG4", "ADGRG3", "ADGRG2", "ADGRG1", "ADGRF5", "ADGRF4", "ADGRF3", 
"ADGRF2", "ADGRF1", "ADGRE5", "ADGRE3", "ADGRE2", "ADGRE1", "ADGRD2", 
"ADGRD1", "CELSR3", "CELSR2", "CELSR1", "ADGRB3", "ADGRB2", "ADGRB1", 
"ADGRA3", "ADGRA2", "ADGRA1"), class = "factor"), baseMean = c("0", 
"0.660578564", "0", "0", "0", "0", "0.178397561", "1.522702077", 
"1.340522526", "0.840975621", "0.301928182", "0.150964091", "10.13217716", 
"1.313089055", "40.93692353", "0", "0", "0", "0", "0", "0", "0", 
"0", "0", "0.963088173", "0.644225417", "0", "0.838976126", "0", 
"1.410153", "0", "0"), log2FoldChange_1 = c(0, 0.050537239, 0, 
0, 0, 0, -0.971180299, 0.315840199, -0.758324082, 0.551442227, 
1.736471275, 0.952373053, -0.492148172, -0.026726608, -0.21961465, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 2.143565116, -0.081269533, 0, -0.575736451, 
0, 3.93190614, 0, 0), lfcSE = c(NA, "2.658534114", NA, NA, NA, 
NA, "4.080472857", "1.61107253", "1.693790467", "2.186266996", 
"4.048576265", "4.080472857", "0.845207802", "1.689808395", "0.404274885", 
NA, NA, NA, NA, NA, NA, NA, NA, NA, "2.887456086", "2.666317145", 
NA, "2.186422311", NA, "2.479435904", NA, NA), stat = c(NA, "0.019009438", 
NA, NA, NA, NA, "-0.238006803", "0.196043439", "-0.447708319", 
"0.252230047", "0.428909118", "0.233397718", "-0.582280678", 
"-0.015816354", "-0.543231", NA, NA, NA, NA, NA, NA, NA, NA, 
NA, "0.742371504", "-0.03048007", NA, "-0.263323535", NA, "1.585806729", 
NA, NA), pvalue_1 = c(1, 0.984833577, 1, 1, 1, 1, 0.811875818, 
0.844576166, 0.654363717, 0.800863255, 0.667989364, 0.815452585, 
0.560377638, 0.987380901, 0.586970765, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 0.45786228, 0.975684188, 1, 0.792301222, 1, 0.112783159, 1, 
1), padj = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
"0.999957765", NA, "0.999957765", NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), log2FoldChange_2 = c(0, 
0.742371504, 0, 0, 0, 0, -0.971180299, -0.575736451, -0.758324082, 
0.551442227, 1.736471275, 0.952373053, -0.492148172, -0.026726608, 
-0.21961465, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.050537239, -0.081269533, 
0, -0.575736451, 0, 2.887456086, 0, 0), pvalue_2 = c(1, 0.984833577, 
1, 1, 1, 1, 0.811875818, 0.844576166, 0.654363717, 0.800863255, 
0.667989364, 0.815452585, 0.560377638, 0.987380901, 0.586970765, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 0.45786228, 0.975684188, 1, 0.792301222, 
1, 0.112783159, 1, 1), log10_pvalue_1 = c(0, 0.00663715295120108, 
0, 0, 0, 0, 0.0905103940122029, 0.0733611785596228, 0.184180789323803, 
0.0964416320665321, 0.175230452483861, 0.0886012862135488, 0.251519203662807, 
0.00551527763353332, 0.231383528931875, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0.33926513341622, 0.0106907331570114, 0, 0.101109674241976, 
0, 0.947755745205597, 0, 0), log10_pvalue_2 = c(0, 0.00663715295120108, 
0, 0, 0, 0, 0.0905103940122029, 0.0733611785596228, 0.184180789323803, 
0.0964416320665321, 0.175230452483861, 0.0886012862135488, 0.251519203662807, 
0.00551527763353332, 0.231383528931875, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0.33926513341622, 0.0106907331570114, 0, 0.101109674241976, 
0, 0.947755745205597, 0, 0)), row.names = c(NA, -32L), class = "data.frame")

Showing 1 to 19 of 32 entries, 11 total columns

I tried jitter and offset as GPT suggested, but it did not work.


Solution

  • One easy way to achieve your desired result is to use geom_col with position = "dodge" instead of geom_segment. This additionally requires mapping sample to the fill aesthetic instead of color:

    library(ggplot2)

    # Fill colour for each sample, keyed by the sample labels
    sample_fills <- setNames(c("blue", "cadetblue1"), c(sample_1, sample_2))

    # Bars instead of segments: position = "dodge" puts the bars of the two
    # samples next to each other within each gene. Bars are coloured through
    # the fill aesthetic, hence scale_fill_manual().
    ggplot(data_lng, aes(x = log2FoldChange, y = gene_symbol, fill = sample)) +
      geom_col(position = "dodge") +
      scale_fill_manual(name = "Sample", values = sample_fills) +
      labs(
        title = "Differential Gene Expression",
        x = "relative expression (log2FoldChange)", y = ""
      ) +
      theme_minimal()