I am creating a DGE plot for 2 samples. I want the lines in the resulting plot NOT to be stacked on top of each other, but to be drawn next to each other (so that both are visible).
## Load packages
# Install whatever is missing. requireNamespace() only checks that a package
# is available; it does not attach it, so every package is attached exactly
# once by the library() calls below. library() stops with an error if a
# package is still unavailable after the installation attempt.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
if (!requireNamespace("tidyr", quietly = TRUE)) {
  install.packages("tidyr")
}
library(ggplot2)
library(dplyr)
library(tidyr)
# Gene symbols, assembled one subfamily at a time and then reversed, so the
# last symbol listed here (ADGRV1) becomes the first element of gene_order.
gene_order <- rev(c(
  paste0("ADGRA", 1:3),
  paste0("ADGRB", 1:3),
  paste0("CELSR", 1:3),
  paste0("ADGRD", 1:2),
  paste0("ADGRE", c(1:3, 5)), # the list has no ADGRE4
  paste0("ADGRF", 1:5),
  paste0("ADGRG", 1:7),
  paste0("ADGRL", 1:4),
  "ADGRV1"
))
## Change variables
# Directory that holds the input table; the plot is written to the same place.
file_path <- "/home/pospim/Desktop/Work/bioinformatics/datasets/GSE189727_HIV_done/GSE189727_HIV_dataset1_SZ_DC_Axl_together - analyzed/DGE_radar"
file_name <- file.path(file_path, "GPCRS-DE_analysis_DC_CDC2.tsv")
plot_name <- file.path(file_path, "DGE_plot.png")
# Display names for the columns suffixed _1 and _2 respectively.
sample_1 <- "DC"
sample_2 <- "CDC2"
# Tab-separated input with a header row; strings are kept as character.
data <- read.table(
  file_name,
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)
head(data)
# Replace commas with dots in every column (presumably decimal commas in the
# input -- confirm against the file). gsub() returns character vectors, so
# all columns are character after this step and are converted back to
# numeric further down.
data[] <- lapply(data, function(x) gsub(",", ".", x, fixed = TRUE))
head(data)
## Clean data
## Handle NAs
# Convert each measurement column to numeric and, in the same pass, replace
# the resulting NAs with a neutral value: 0 for a fold change, 1 for a p-value.
data[c("log2FoldChange_1", "pvalue_1", "log2FoldChange_2", "pvalue_2")] <- Map(
  function(column, fill) {
    values <- as.numeric(as.character(column))
    values[is.na(values)] <- fill
    values
  },
  data[c("log2FoldChange_1", "pvalue_1", "log2FoldChange_2", "pvalue_2")],
  c(0, 1, 0, 1)
)
## Add log10 pvalue
# Stored as -log10(p); the new columns are appended in the order _1, _2.
data[c("log10_pvalue_1", "log10_pvalue_2")] <- unname(
  lapply(data[c("pvalue_1", "pvalue_2")], function(p) -log10(p))
)
# Turn the gene symbols into a factor whose levels follow gene_order.
data$gene_symbol <- factor(data$gene_symbol, levels = gene_order)
head(data)
## Convert data to long format for plotting
# A single pivot splits each column name into a measure (".value") and the
# trailing sample index, giving one row per gene and sample with
# log2FoldChange and log10_pvalue side by side. This avoids building the
# cross product of two separate pivots and filtering it afterwards.
# The index is taken from the anchored suffix "_1" / "_2" only: the name
# "log2FoldChange" itself contains the digit 2, so searching the whole name
# for "2" would match both samples.
data_lng <- data %>%
  pivot_longer(
    cols = matches("^(log2FoldChange|log10_pvalue)_[12]$"),
    names_to = c(".value", "sample"),
    names_pattern = "^(.+)_([12])$"
  ) %>%
  # Translate the numeric suffix into the sample's display name.
  mutate(sample = if_else(sample == "1", sample_1, sample_2)) %>%
  select(gene_symbol, sample, log2FoldChange, log10_pvalue)
head(data_lng)
# Make sure the gene symbols are a factor ordered by gene_order.
data_lng$gene_symbol <- factor(data_lng$gene_symbol, levels = gene_order)
## Plot DGE
# One horizontal segment per gene and sample, running from the sample's
# log2FoldChange to 0. Both samples of a gene use the same y value
# (gene_symbol), so their segments are drawn at the same height.
p <- ggplot(
  data = data_lng,
  mapping = aes(x = log2FoldChange, y = gene_symbol, color = sample)
) +
  geom_segment(
    mapping = aes(xend = 0, yend = gene_symbol),
    linewidth = 3,
    show.legend = TRUE
  ) +
  # Fixed colour per sample, keyed by the sample display names.
  scale_color_manual(
    name = "Sample",
    values = setNames(c("blue", "cadetblue1"), c(sample_1, sample_2))
  ) +
  ggtitle("Differential Gene Expression") +
  xlab("relative expression (log2FoldChange)") +
  ylab("") +
  theme_minimal()
# Write the plot to plot_name, then draw it on the current device.
ggsave(plot_name, plot = p)
print(p)
structure(list(gene_symbol = structure(32:1, levels = c("ADGRV1",
"ADGRL4", "ADGRL3", "ADGRL2", "ADGRL1", "ADGRG7", "ADGRG6", "ADGRG5",
"ADGRG4", "ADGRG3", "ADGRG2", "ADGRG1", "ADGRF5", "ADGRF4", "ADGRF3",
"ADGRF2", "ADGRF1", "ADGRE5", "ADGRE3", "ADGRE2", "ADGRE1", "ADGRD2",
"ADGRD1", "CELSR3", "CELSR2", "CELSR1", "ADGRB3", "ADGRB2", "ADGRB1",
"ADGRA3", "ADGRA2", "ADGRA1"), class = "factor"), baseMean = c("0",
"0.660578564", "0", "0", "0", "0", "0.178397561", "1.522702077",
"1.340522526", "0.840975621", "0.301928182", "0.150964091", "10.13217716",
"1.313089055", "40.93692353", "0", "0", "0", "0", "0", "0", "0",
"0", "0", "0.963088173", "0.644225417", "0", "0.838976126", "0",
"1.410153", "0", "0"), log2FoldChange_1 = c(0, 0.050537239, 0,
0, 0, 0, -0.971180299, 0.315840199, -0.758324082, 0.551442227,
1.736471275, 0.952373053, -0.492148172, -0.026726608, -0.21961465,
0, 0, 0, 0, 0, 0, 0, 0, 0, 2.143565116, -0.081269533, 0, -0.575736451,
0, 3.93190614, 0, 0), lfcSE = c(NA, "2.658534114", NA, NA, NA,
NA, "4.080472857", "1.61107253", "1.693790467", "2.186266996",
"4.048576265", "4.080472857", "0.845207802", "1.689808395", "0.404274885",
NA, NA, NA, NA, NA, NA, NA, NA, NA, "2.887456086", "2.666317145",
NA, "2.186422311", NA, "2.479435904", NA, NA), stat = c(NA, "0.019009438",
NA, NA, NA, NA, "-0.238006803", "0.196043439", "-0.447708319",
"0.252230047", "0.428909118", "0.233397718", "-0.582280678",
"-0.015816354", "-0.543231", NA, NA, NA, NA, NA, NA, NA, NA,
NA, "0.742371504", "-0.03048007", NA, "-0.263323535", NA, "1.585806729",
NA, NA), pvalue_1 = c(1, 0.984833577, 1, 1, 1, 1, 0.811875818,
0.844576166, 0.654363717, 0.800863255, 0.667989364, 0.815452585,
0.560377638, 0.987380901, 0.586970765, 1, 1, 1, 1, 1, 1, 1, 1,
1, 0.45786228, 0.975684188, 1, 0.792301222, 1, 0.112783159, 1,
1), padj = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
"0.999957765", NA, "0.999957765", NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), log2FoldChange_2 = c(0,
0.742371504, 0, 0, 0, 0, -0.971180299, -0.575736451, -0.758324082,
0.551442227, 1.736471275, 0.952373053, -0.492148172, -0.026726608,
-0.21961465, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.050537239, -0.081269533,
0, -0.575736451, 0, 2.887456086, 0, 0), pvalue_2 = c(1, 0.984833577,
1, 1, 1, 1, 0.811875818, 0.844576166, 0.654363717, 0.800863255,
0.667989364, 0.815452585, 0.560377638, 0.987380901, 0.586970765,
1, 1, 1, 1, 1, 1, 1, 1, 1, 0.45786228, 0.975684188, 1, 0.792301222,
1, 0.112783159, 1, 1), log10_pvalue_1 = c(0, 0.00663715295120108,
0, 0, 0, 0, 0.0905103940122029, 0.0733611785596228, 0.184180789323803,
0.0964416320665321, 0.175230452483861, 0.0886012862135488, 0.251519203662807,
0.00551527763353332, 0.231383528931875, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0.33926513341622, 0.0106907331570114, 0, 0.101109674241976,
0, 0.947755745205597, 0, 0), log10_pvalue_2 = c(0, 0.00663715295120108,
0, 0, 0, 0, 0.0905103940122029, 0.0733611785596228, 0.184180789323803,
0.0964416320665321, 0.175230452483861, 0.0886012862135488, 0.251519203662807,
0.00551527763353332, 0.231383528931875, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0.33926513341622, 0.0106907331570114, 0, 0.101109674241976,
0, 0.947755745205597, 0, 0)), row.names = c(NA, -32L), class = "data.frame")
Showing 1 to 19 of 32 entries, 11 total columns
I tried jitter and offset as GPT suggested, but it did not work.
One easy way to achieve the desired result is to use `geom_col` with `position = "dodge"` instead of `geom_segment`. This additionally requires mapping `sample` to `fill` instead of `color`:
library(ggplot2)
# Horizontal bars from 0 to log2FoldChange; position = "dodge" places the
# bars of the two samples side by side within each gene instead of on top
# of each other. Bars are coloured through the fill aesthetic.
ggplot(
  data = data_lng,
  mapping = aes(x = log2FoldChange, y = gene_symbol, fill = sample)
) +
  geom_col(position = "dodge") +
  scale_fill_manual(
    name = "Sample",
    values = setNames(c("blue", "cadetblue1"), c(sample_1, sample_2))
  ) +
  ggtitle("Differential Gene Expression") +
  xlab("relative expression (log2FoldChange)") +
  ylab("") +
  theme_minimal()