I have a simple line plot where I show a DNA sequence on the x-axis, done in the following way with `ggplot2`:
# Example DNA sequence; each base becomes one x-axis tick label.
myseq <- "AGAATATTATACATTCATCT"

# Reproducible noisy signal sampled at time points 1..100.
set.seed(123)
mydata <- data.frame(
  time = seq_len(100),
  value = rnorm(100, mean = 10, sd = 2)
)

# Twenty evenly spaced tick positions between 5 and 100, one per base.
indices <- seq(5, 100, length.out = 20)
seqsplit <- strsplit(myseq, "")[[1]]

# Attach the base call to the time points that carry a tick; every other
# row gets NA in `call`.
ind_df <- data.frame(call = seqsplit, time = indices)
final_df <- dplyr::left_join(mydata, ind_df, by = "time")

# Tick-label colour per base: A = green, C = blue, G = black, and anything
# else (here: T) = red.
xcolors <- c("green", "blue", "black", "red")[
  match(seqsplit, c("A", "C", "G"), nomatch = 4L)
]
# Line plot of the signal with the DNA bases as coloured x-axis tick labels.
P <- ggplot2::ggplot(final_df, ggplot2::aes(x = time, y = value)) +
  ggplot2::geom_line(linewidth = 0.5) +
  ggplot2::scale_x_continuous(breaks = indices, labels = seqsplit) +
  ggplot2::scale_y_continuous(limits = c(5, 17)) +
  ggplot2::theme_light() +
  ggplot2::theme(
    axis.title.x = ggplot2::element_blank(),
    # One colour per tick label, in the same order as `seqsplit`.
    axis.text.x = ggtext::element_markdown(face = "bold", color = xcolors)
  )

# Write a 10 x 3 inch PDF. ggsave() opens and closes the graphics device
# itself, so the device is not left open if rendering the plot fails.
ggplot2::ggsave(filename = "test.pdf", plot = P, width = 10, height = 3)
which produces:
Now I have an associated amino acid sequence of length 4, for which I know the start and end positions in the DNA sequence. Each letter of the amino acid sequence corresponds to 3 letters in the DNA sequence.
# Positions in the DNA sequence where the amino-acid sequence starts and
# ends, followed by the amino-acid sequence itself (one letter per 3 bases).
start <- 5
end <- 17
aaseq <- "WXYZ"
Here the amino acid sequence `WXYZ` starts on T-5 and ends on C-17 of the DNA sequence above, and I want to plot them together.
This would be my ultimate goal (it could be just squares instead of "arrows"):
Is there an easy way to accomplish this in `ggplot2`?
An easy option without the arrows would be to use a `geom_segment`:
# Flat bars instead of arrows: one thick horizontal segment per row of
# `df_arrows`, drawn at y = 16 and coloured with the literal colour names
# stored in its `color` column.
geom_segment(
  mapping = aes(x = x, xend = xend, y = 16, yend = 16, color = I(color)),
  data = df_arrows,
  linewidth = 8
)
But if you want the arrows, I would suggest using `geom_polygon`, which, however, requires some effort to create a data frame with the coordinates for the polygons:
library(ggplot2)
library(dplyr)
# Amino-acid sequence and the positions (indices into `indices`) where its
# first codon starts and its last codon ends; each letter spans 3 bases.
aaseq <- "WXYZ"
start <- 5
end <- 17

# Arrow layout, in data units.
arrow_y <- 16            # vertical centre of the arrow row
arrow_half_height <- 0.5 # half of the arrow body height
arrow_tip <- 4           # how far each tip reaches past the codon's right edge
label_nudge <- 1         # rightward shift applied to the text labels
# Fill colours; recycled when the sequence has more letters than colours.
aa_palette <- c("blue", "green", "orange", "purple")

aa_labels <- strsplit(aaseq, "")[[1]]
codon_starts <- seq(start, end - 3, by = 3)
# Fail loudly instead of letting data.frame() silently recycle the labels
# when their count merely divides the number of codons.
stopifnot(
  "`aaseq` needs exactly one letter per codon between `start` and `end`" =
    length(aa_labels) == length(codon_starts)
)

# One row per amino acid: left/right x position, row height, fill colour,
# displayed letter, and a unique id.
df_arrows <- data.frame(
  x = indices[codon_starts],
  xend = indices[codon_starts + 3],
  y = arrow_y, yend = arrow_y,
  color = rep_len(aa_palette, length(aa_labels)),
  label = aa_labels,
  id = seq_along(aa_labels)
)

# Five vertices per arrow: top-left, top-right, tip, bottom-right,
# bottom-left. Grouping is by `id`, not by letter, so a sequence with a
# repeated amino acid still yields one polygon per position. The factor
# levels are reversed so that earlier arrows are drawn later (on top),
# letting each tip overlap the start of the next arrow.
df_polygon <- df_arrows |>
  dplyr::mutate(id = factor(id, levels = rev(id))) |>
  dplyr::reframe(
    data.frame(
      x = c(x, xend, xend + arrow_tip, xend, x),
      y = y + arrow_half_height * c(1, 1, 0, -1, -1),
      color = color,
      label = label
    ),
    .by = id
  )

ggplot(final_df, aes(x = time, y = value)) +
  scale_x_continuous(breaks = indices, labels = seqsplit) +
  scale_y_continuous(limits = c(5, 17)) +
  theme_light() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = ggtext::element_markdown(face = "bold", color = xcolors)
  ) +
  # Shade the stretch of DNA covered by the amino-acid sequence.
  annotate(
    "rect",
    xmin = indices[start], xmax = indices[end],
    ymin = -Inf, ymax = Inf,
    fill = "grey", alpha = .4
  ) +
  # Arrow bodies, filled with the literal colour names via I().
  geom_polygon(
    data = df_polygon,
    aes(x = x, y = y, fill = I(color), group = id)
  ) +
  # Amino-acid letters, placed at the codon midpoint plus `label_nudge`.
  geom_text(
    data = df_arrows,
    aes(x = (x + xend) / 2 + label_nudge, y = y, label = label),
    color = "white", fontface = "bold"
  ) +
  # The signal is added last so it is drawn over the shaded rectangle.
  geom_line(linewidth = 0.5)