rgenomicranges

Plot the longest transcript in GenomicRanges with ggbio


I am trying to plot an specific region using ggbio. I am using the below code that produced my desire output, except that it contains several transcript. Is it possible to only plot the longest transcript? I've not been able to access the genomic ranges object within Homo.sapiens that I assume contains this information.

library(ggbio)
library(Homo.sapiens)

range <- GRanges("chr10"  , IRanges(start = 78000000 , end = 79000000))
p.txdb <- autoplot(Homo.sapiens, which = range)
p.txdb

Solution

  • Here is a solution that involves filtering TxDb.Hsapiens.UCSC.hg19.knownGene on the longest transcript by gene_id (which does remove genes without gene_id):

    suppressPackageStartupMessages({
        invisible(lapply(c("ggbio", "biovizBase", "data.table", 
            "TxDb.Hsapiens.UCSC.hg19.knownGene", 
            "org.Hs.eg.db"),
            require, character.only = TRUE))})
    txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
    
    # retrieve transcript lengths
    txlen <- transcriptLengths(txdb, with.utr5_len=TRUE, with.utr3_len=TRUE)
    setDT(txlen)
    txlen$len <- rowSums(as.matrix(txlen[, .(tx_len, utr5_len, utr3_len)]))
    setkey(txlen, gene_id, len, tx_id)
    
    # filter longesttranscript by gene_id
    ltx <- txlen[!is.na(gene_id)][, tail(.SD,1), by=gene_id]$tx_id
    
    # filter txdb object
    txb <- as.list(txdb)
    txb$transcripts <- txb$transcripts[txb$transcripts$tx_id %in% ltx, ]
    txb$splicings <- txb$splicings[txb$splicings$tx_id %in% ltx,]
    txb$genes <- txb$genes[txb$genes$tx_id %in% ltx,]
    txb <- do.call(makeTxDb, txb)
    
    # plot according to vignette, chapter 2.2.5
    range <- GRanges("chr10", IRanges(start = 78000000 , end = 79000000))
    gr.txdb <- crunch(txb, which = range)
    #> Parsing transcripts...
    #> Parsing exons...
    #> Parsing cds...
    #> Parsing utrs...
    #> ------exons...
    #> ------cdss...
    #> ------introns...
    #> ------utr...
    #> aggregating...
    #> Done
    colnames(values(gr.txdb))[4] <- "model"
    grl <- split(gr.txdb, gr.txdb$gene_id)
    symbols <- select(org.Hs.eg.db, keys=names(grl), columns="SYMBOL", keytype="ENTREZID")
    #> 'select()' returned 1:1 mapping between keys and columns
    names(grl) <- symbols[match(symbols$ENTREZID, names(grl), nomatch=0),"SYMBOL"]
    autoplot(grl, aes(type = "model"), gap.geom="chevron")
    #> Constructing graphics...
    

    Created on 2020-05-29 by the reprex package (v0.3.0)

    Edit: To get gene symbols instead of gene (or transcript) ids, just replace the names of grl with the associated gene symbols, e.g. via org.Hs.eg.db, or any other resource that matches them up.