rbioconductorgenomicranges

Get ranges for synonymous and non-synonymous nucleotide positions within a codon separately


I have GRanges object (coordinates of all gene exons); coding_pos defines what is the start position of a codon in a particular exon (1 means that first nucleotide in exon is also the first nt in a codon, and so on).

grTargetGene itself looks like this

> grTargetGene

GRanges object with 11 ranges and 7 metadata columns:
   seqnames                 ranges strand |     ensembl_ids   gene_biotype prev_exons_length coding_pos
      <Rle>              <IRanges>  <Rle> |     <character>    <character>         <numeric>  <numeric>
   [1]     chr2 [148602722, 148602776]      + | ENSG00000121989 protein_coding       0           1
   [2]     chr2 [148653870, 148654077]      + | ENSG00000121989 protein_coding       55          2
   [3]     chr2 [148657027, 148657136]      + | ENSG00000121989 protein_coding       263         3
   [4]     chr2 [148657313, 148657467]      + | ENSG00000121989 protein_coding       373         2
   [5]     chr2 [148672760, 148672903]      + | ENSG00000121989 protein_coding       528         1
   [6]     chr2 [148674852, 148674995]      + | ENSG00000121989 protein_coding       672         1
   [7]     chr2 [148676016, 148676161]      + | ENSG00000121989 protein_coding       816         1
   [8]     chr2 [148677799, 148677913]      + | ENSG00000121989 protein_coding       962         3
   [9]     chr2 [148680542, 148680680]      + | ENSG00000121989 protein_coding       1077        1
  [10]     chr2 [148683600, 148683730]      + | ENSG00000121989 protein_coding       1216        2
  [11]     chr2 [148684649, 148684843]      + | ENSG00000121989 protein_coding       1347        1
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

I am interested in looking at coordinates separately for [1,2] positions in each codon and [3]. In other words, I would like to have 2 different GRanges objects that look approximately like this (here it is only the beginning)

> grTargetGene_Nonsynonym

GRanges object with X ranges and 7 metadata columns:
   seqnames                 ranges strand |     ensembl_ids   gene_biotype 
      <Rle>              <IRanges>  <Rle> |     <character>    <character> 
   [1]     chr2 [148602722, 148602723]      + | ENSG00000121989 protein_coding
   [2]     chr2 [148602725, 148602726]      + | ENSG00000121989 protein_coding
   [3]     chr2 [148602728, 148602729]      + | ENSG00000121989 protein_coding
   [4]     chr2 [148602731, 148602732]      + | ENSG00000121989 protein_coding



> grTargetGene_Synonym

GRanges object with X ranges and 7 metadata columns:
   seqnames                 ranges strand |     ensembl_ids   gene_biotype 
      <Rle>              <IRanges>  <Rle> |     <character>    <character> 
   [1]     chr2 [148602724, 148602724]      + | ENSG00000121989 protein_coding
   [2]     chr2 [148602727, 148602727]      + | ENSG00000121989 protein_coding
   [3]     chr2 [148602730, 148602730]      + | ENSG00000121989 protein_coding
   [4]     chr2 [148602733, 148602733]      + | ENSG00000121989 protein_coding

I was planning to do it through the loop that creates a set of granges for each exon according to coding_pos and strand, but I suspect there is a smarter way or maybe even a function that can do it already, but I couldn't find a simple solution.

Important: I do not need the sequence itself (the easiest way, in that case, would be to extract DNA first and then work with the sequence), but instead of doing this I only need the positions which I will use to overlap with some features.

> library("GenomicRanges")
> dput(grTargetGene)

new("GRanges"
, seqnames = new("Rle"
, values = structure(1L, .Label = "chr2", class = "factor")
, lengths = 6L
, elementMetadata = NULL
, metadata = list()
)
, ranges = new("IRanges"
, start = c(148602722L, 148653870L, 148657027L, 148657313L, 148672760L, 
148674852L)
, width = c(55L, 208L, 110L, 155L, 144L, 144L)
, NAMES = NULL
, elementType = "integer"
, elementMetadata = NULL
, metadata = list()
)
, strand = new("Rle"
, values = structure(1L, .Label = c("+", "-", "*"), class = "factor")
, lengths = 6L
, elementMetadata = NULL
, metadata = list()
)
, elementMetadata = new("DataFrame"
, rownames = NULL
, nrows = 6L
, listData = structure(list(ensembl_ids =
c("ENSG00000121989","ENSG00000121989", 
"ENSG00000121989", "ENSG00000121989", "ENSG00000121989", "ENSG00000121989"
), gene_biotype = c("protein_coding", "protein_coding", "protein_coding", 
"protein_coding", "protein_coding", "protein_coding"), cds_length =   
c(1542,1542, 1542, 1542, 1542, 1542), gene_start_position = c(148602086L, 
148602086L, 148602086L, 148602086L, 148602086L, 148602086L), 
gene_end_position = c(148688393L, 148688393L, 148688393L, 
148688393L, 148688393L, 148688393L), prev_exons_length = c(0, 
55, 263, 373, 528, 672), coding_pos = c(1, 2, 3, 2, 1, 1)), .Names =  
c("ensembl_ids", "gene_biotype", "cds_length", "gene_start_position",
"gene_end_position", 
"prev_exons_length", "coding_pos"))
, elementType = "ANY"
, elementMetadata = NULL
, metadata = list()
)
, seqinfo = new("Seqinfo"
, seqnames = "chr2"
, seqlengths = NA_integer_
, is_circular = NA
, genome = NA_character_
)
, metadata = list()
)

Solution

  • I created a function that can account for a chain and allows to process exons that length is not divisible by 3 (and might be even less than 3)

    CodonPosition_separation = function(grTargetGene) {
        grTargetGene = sort(grTargetGene)
        grTargetGene$prev_exons_length = c(0,width(grTargetGene)[1:length(grTargetGene)-1])
        if (length(grTargetGene) >1) {
            for (l in 2:length(grTargetGene)) {
              grTargetGene$prev_exons_length[l] = grTargetGene$prev_exons_length[l]+grTargetGene$prev_exons_length[l-1]
            }
          }
      grTargetGene$coding_pos =  grTargetGene$prev_exons_length%%3+1
      grTargetGene_N =  GRanges()
      grTargetGene_S =  GRanges()
      for (l in 1:length(grTargetGene)) {
        for (obj in c("start_nonsyn","start_syn", "end_nonsyn", "end_syn","gr_nonsyn","gr_syn")) {if(exists(obj)) {rm(obj)}}
        if (as.character(strand(grTargetGene)[1]) =="+"){
          start_ns = start(grTargetGene[l])+1-grTargetGene$coding_pos[l]
          end_ns = end(grTargetGene[l])
          if (start_ns <=end_ns) {
            start_nonsyn = seq(from = start(grTargetGene[l])+1-grTargetGene$coding_pos[l],to = end(grTargetGene[l]), by=3)
            end_nonsyn = seq(from = start(grTargetGene[l])+2-grTargetGene$coding_pos[l],to = end(grTargetGene[l]), by=3)
          }
          start_s =start(grTargetGene[l])+3-grTargetGene$coding_pos[l]
          end_s = end(grTargetGene[l])
          if (start_s <=end_s) {
            start_syn = seq(from = start(grTargetGene[l])+3-grTargetGene$coding_pos[l],to = end(grTargetGene[l]), by=3)
            end_syn = start_syn
          }
    
        } else {
          start_ns = end(grTargetGene[l])-1+grTargetGene$coding_pos[l]
          end_ns = start(grTargetGene[l])
          if (start_ns >=end_ns) {
            start_nonsyn = seq(from = end(grTargetGene[l])-1+grTargetGene$coding_pos[l],to = start(grTargetGene[l]), by=-3)
            end_nonsyn = seq(from = end(grTargetGene[l])-2+grTargetGene$coding_pos[l],to = start(grTargetGene[l]), by=-3)
          }
          start_s =end(grTargetGene[l])-3+grTargetGene$coding_pos[l]
          end_s = start(grTargetGene[l])
          if (start_ns >=end_ns) {
            start_syn = seq(from = end(grTargetGene[l])-3+grTargetGene$coding_pos[l],to = start(grTargetGene[l]), by=-3)
            end_syn = start_syn
          }
        }
        if (exists("start_nonsyn")) {
          length_nonsyn = length(start_nonsyn)+ length(end_nonsyn)
          gr_nonsyn = GRanges(
            seqnames = rep(seqnames(grTargetGene[l]), length_nonsyn),
            strand = rep(strand(grTargetGene[l]), length_nonsyn),
            ranges = IRanges(start = c(start_nonsyn, end_nonsyn), end = c(start_nonsyn, end_nonsyn))
          )
          gr_nonsyn = intersect(gr_nonsyn,grTargetGene[l])
          grTargetGene_N = append(grTargetGene_N, gr_nonsyn)
          } 
        if (exists("start_syn")) {
          length_syn = length(start_syn)
          gr_syn = GRanges(
            seqnames = rep(seqnames(grTargetGene[l]), length_syn),
            strand = rep(strand(grTargetGene[l]), length_syn),
            ranges = IRanges(start = start_syn, end = end_syn)
          )
          gr_syn = intersect(gr_syn,grTargetGene[l])
          grTargetGene_S = append(grTargetGene_S, gr_syn)
        }
      }
      return(list("grTargetGene_S"=grTargetGene_S,"grTargetGene_N"=grTargetGene_N))
    }
    

    It works nicely:

    > CodonPosition_separation(grTargetGene)
    $grTargetGene_S
    GRanges object with 514 ranges and 0 metadata columns:
            seqnames                 ranges strand
               <Rle>              <IRanges>  <Rle>
        [1]     chr2 [148602724, 148602724]      +
        [2]     chr2 [148602727, 148602727]      +
        [3]     chr2 [148602730, 148602730]      +
        [4]     chr2 [148602733, 148602733]      +
        [5]     chr2 [148602736, 148602736]      +
        ...      ...                    ...    ...
      [510]     chr2 [148684831, 148684831]      +
      [511]     chr2 [148684834, 148684834]      +
      [512]     chr2 [148684837, 148684837]      +
      [513]     chr2 [148684840, 148684840]      +
      [514]     chr2 [148684843, 148684843]      +
      -------
      seqinfo: 1 sequence from an unspecified genome; no seqlengths
    
    $grTargetGene_N
    GRanges object with 517 ranges and 0 metadata columns:
            seqnames                 ranges strand
               <Rle>              <IRanges>  <Rle>
        [1]     chr2 [148602722, 148602723]      +
        [2]     chr2 [148602725, 148602726]      +
        [3]     chr2 [148602728, 148602729]      +
        [4]     chr2 [148602731, 148602732]      +
        [5]     chr2 [148602734, 148602735]      +
        ...      ...                    ...    ...
      [513]     chr2 [148684829, 148684830]      +
      [514]     chr2 [148684832, 148684833]      +
      [515]     chr2 [148684835, 148684836]      +
      [516]     chr2 [148684838, 148684839]      +
      [517]     chr2 [148684841, 148684842]      +
      -------
      seqinfo: 1 sequence from an unspecified genome; no seqlengths