I have GRanges object (coordinates of all gene exons); coding_pos
defines what is the start position of a codon in a particular exon (1 means that first nucleotide in exon is also the first nt in a codon, and so on).
grTargetGene itself looks like this
> grTargetGene
GRanges object with 11 ranges and 7 metadata columns:
seqnames ranges strand | ensembl_ids gene_biotype prev_exons_length coding_pos
<Rle> <IRanges> <Rle> | <character> <character> <numeric> <numeric>
[1] chr2 [148602722, 148602776] + | ENSG00000121989 protein_coding 0 1
[2] chr2 [148653870, 148654077] + | ENSG00000121989 protein_coding 55 2
[3] chr2 [148657027, 148657136] + | ENSG00000121989 protein_coding 263 3
[4] chr2 [148657313, 148657467] + | ENSG00000121989 protein_coding 373 2
[5] chr2 [148672760, 148672903] + | ENSG00000121989 protein_coding 528 1
[6] chr2 [148674852, 148674995] + | ENSG00000121989 protein_coding 672 1
[7] chr2 [148676016, 148676161] + | ENSG00000121989 protein_coding 816 1
[8] chr2 [148677799, 148677913] + | ENSG00000121989 protein_coding 962 3
[9] chr2 [148680542, 148680680] + | ENSG00000121989 protein_coding 1077 1
[10] chr2 [148683600, 148683730] + | ENSG00000121989 protein_coding 1216 2
[11] chr2 [148684649, 148684843] + | ENSG00000121989 protein_coding 1347 1
-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths
I am interested in looking at coordinates separately for [1,2] positions in each codon and [3]. In other words, I would like to have 2 different GRanges objects that look approximately like this (here it is only the beginning)
> grTargetGene_Nonsynonym
GRanges object with X ranges and 7 metadata columns:
seqnames ranges strand | ensembl_ids gene_biotype
<Rle> <IRanges> <Rle> | <character> <character>
[1] chr2 [148602722, 148602723] + | ENSG00000121989 protein_coding
[2] chr2 [148602725, 148602726] + | ENSG00000121989 protein_coding
[3] chr2 [148602728, 148602729] + | ENSG00000121989 protein_coding
[4] chr2 [148602731, 148602732] + | ENSG00000121989 protein_coding
> grTargetGene_Synonym
GRanges object with X ranges and 7 metadata columns:
seqnames ranges strand | ensembl_ids gene_biotype
<Rle> <IRanges> <Rle> | <character> <character>
[1] chr2 [148602724, 148602724] + | ENSG00000121989 protein_coding
[2] chr2 [148602727, 148602727] + | ENSG00000121989 protein_coding
[3] chr2 [148602730, 148602730] + | ENSG00000121989 protein_coding
[4] chr2 [148602733, 148602733] + | ENSG00000121989 protein_coding
I was planning to do it through the loop that creates a set of granges for each exon according to coding_pos
and strand
, but I suspect there is a smarter way or maybe even a function that can do it already, but I couldn't find a simple solution.
Important: I do not need the sequence itself (the easiest way, in that case, would be to extract DNA first and then work with the sequence), but instead of doing this I only need the positions which I will use to overlap with some features.
> library("GenomicRanges")
> dput(grTargetGene)
new("GRanges"
, seqnames = new("Rle"
, values = structure(1L, .Label = "chr2", class = "factor")
, lengths = 6L
, elementMetadata = NULL
, metadata = list()
)
, ranges = new("IRanges"
, start = c(148602722L, 148653870L, 148657027L, 148657313L, 148672760L,
148674852L)
, width = c(55L, 208L, 110L, 155L, 144L, 144L)
, NAMES = NULL
, elementType = "integer"
, elementMetadata = NULL
, metadata = list()
)
, strand = new("Rle"
, values = structure(1L, .Label = c("+", "-", "*"), class = "factor")
, lengths = 6L
, elementMetadata = NULL
, metadata = list()
)
, elementMetadata = new("DataFrame"
, rownames = NULL
, nrows = 6L
, listData = structure(list(ensembl_ids =
c("ENSG00000121989","ENSG00000121989",
"ENSG00000121989", "ENSG00000121989", "ENSG00000121989", "ENSG00000121989"
), gene_biotype = c("protein_coding", "protein_coding", "protein_coding",
"protein_coding", "protein_coding", "protein_coding"), cds_length =
c(1542,1542, 1542, 1542, 1542, 1542), gene_start_position = c(148602086L,
148602086L, 148602086L, 148602086L, 148602086L, 148602086L),
gene_end_position = c(148688393L, 148688393L, 148688393L,
148688393L, 148688393L, 148688393L), prev_exons_length = c(0,
55, 263, 373, 528, 672), coding_pos = c(1, 2, 3, 2, 1, 1)), .Names =
c("ensembl_ids", "gene_biotype", "cds_length", "gene_start_position",
"gene_end_position",
"prev_exons_length", "coding_pos"))
, elementType = "ANY"
, elementMetadata = NULL
, metadata = list()
)
, seqinfo = new("Seqinfo"
, seqnames = "chr2"
, seqlengths = NA_integer_
, is_circular = NA
, genome = NA_character_
)
, metadata = list()
)
I created a function that can account for a chain and allows to process exons that length is not divisible by 3 (and might be even less than 3)
CodonPosition_separation = function(grTargetGene) {
grTargetGene = sort(grTargetGene)
grTargetGene$prev_exons_length = c(0,width(grTargetGene)[1:length(grTargetGene)-1])
if (length(grTargetGene) >1) {
for (l in 2:length(grTargetGene)) {
grTargetGene$prev_exons_length[l] = grTargetGene$prev_exons_length[l]+grTargetGene$prev_exons_length[l-1]
}
}
grTargetGene$coding_pos = grTargetGene$prev_exons_length%%3+1
grTargetGene_N = GRanges()
grTargetGene_S = GRanges()
for (l in 1:length(grTargetGene)) {
for (obj in c("start_nonsyn","start_syn", "end_nonsyn", "end_syn","gr_nonsyn","gr_syn")) {if(exists(obj)) {rm(obj)}}
if (as.character(strand(grTargetGene)[1]) =="+"){
start_ns = start(grTargetGene[l])+1-grTargetGene$coding_pos[l]
end_ns = end(grTargetGene[l])
if (start_ns <=end_ns) {
start_nonsyn = seq(from = start(grTargetGene[l])+1-grTargetGene$coding_pos[l],to = end(grTargetGene[l]), by=3)
end_nonsyn = seq(from = start(grTargetGene[l])+2-grTargetGene$coding_pos[l],to = end(grTargetGene[l]), by=3)
}
start_s =start(grTargetGene[l])+3-grTargetGene$coding_pos[l]
end_s = end(grTargetGene[l])
if (start_s <=end_s) {
start_syn = seq(from = start(grTargetGene[l])+3-grTargetGene$coding_pos[l],to = end(grTargetGene[l]), by=3)
end_syn = start_syn
}
} else {
start_ns = end(grTargetGene[l])-1+grTargetGene$coding_pos[l]
end_ns = start(grTargetGene[l])
if (start_ns >=end_ns) {
start_nonsyn = seq(from = end(grTargetGene[l])-1+grTargetGene$coding_pos[l],to = start(grTargetGene[l]), by=-3)
end_nonsyn = seq(from = end(grTargetGene[l])-2+grTargetGene$coding_pos[l],to = start(grTargetGene[l]), by=-3)
}
start_s =end(grTargetGene[l])-3+grTargetGene$coding_pos[l]
end_s = start(grTargetGene[l])
if (start_ns >=end_ns) {
start_syn = seq(from = end(grTargetGene[l])-3+grTargetGene$coding_pos[l],to = start(grTargetGene[l]), by=-3)
end_syn = start_syn
}
}
if (exists("start_nonsyn")) {
length_nonsyn = length(start_nonsyn)+ length(end_nonsyn)
gr_nonsyn = GRanges(
seqnames = rep(seqnames(grTargetGene[l]), length_nonsyn),
strand = rep(strand(grTargetGene[l]), length_nonsyn),
ranges = IRanges(start = c(start_nonsyn, end_nonsyn), end = c(start_nonsyn, end_nonsyn))
)
gr_nonsyn = intersect(gr_nonsyn,grTargetGene[l])
grTargetGene_N = append(grTargetGene_N, gr_nonsyn)
}
if (exists("start_syn")) {
length_syn = length(start_syn)
gr_syn = GRanges(
seqnames = rep(seqnames(grTargetGene[l]), length_syn),
strand = rep(strand(grTargetGene[l]), length_syn),
ranges = IRanges(start = start_syn, end = end_syn)
)
gr_syn = intersect(gr_syn,grTargetGene[l])
grTargetGene_S = append(grTargetGene_S, gr_syn)
}
}
return(list("grTargetGene_S"=grTargetGene_S,"grTargetGene_N"=grTargetGene_N))
}
It works nicely:
> CodonPosition_separation(grTargetGene)
$grTargetGene_S
GRanges object with 514 ranges and 0 metadata columns:
seqnames ranges strand
<Rle> <IRanges> <Rle>
[1] chr2 [148602724, 148602724] +
[2] chr2 [148602727, 148602727] +
[3] chr2 [148602730, 148602730] +
[4] chr2 [148602733, 148602733] +
[5] chr2 [148602736, 148602736] +
... ... ... ...
[510] chr2 [148684831, 148684831] +
[511] chr2 [148684834, 148684834] +
[512] chr2 [148684837, 148684837] +
[513] chr2 [148684840, 148684840] +
[514] chr2 [148684843, 148684843] +
-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths
$grTargetGene_N
GRanges object with 517 ranges and 0 metadata columns:
seqnames ranges strand
<Rle> <IRanges> <Rle>
[1] chr2 [148602722, 148602723] +
[2] chr2 [148602725, 148602726] +
[3] chr2 [148602728, 148602729] +
[4] chr2 [148602731, 148602732] +
[5] chr2 [148602734, 148602735] +
... ... ... ...
[513] chr2 [148684829, 148684830] +
[514] chr2 [148684832, 148684833] +
[515] chr2 [148684835, 148684836] +
[516] chr2 [148684838, 148684839] +
[517] chr2 [148684841, 148684842] +
-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths