I am try to split Granges
to specific n
of bins, usually, GenomicRanges::tile
could work for this. However, my Granges
has some gaps, for example:
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("GenomicRanges")
# BiocManager::install("GenomicFeatures")
library(GenomicRanges)
library(GenomicFeatures)
gr <- GRanges(
seqnames = "chr1",
ranges = IRanges(start = c(100, 200, 300), end = c(148, 249, 349)),
strand = "+"
)
GRanges object with 3 ranges and 0 metadata columns:
seqnames ranges strand
<Rle> <IRanges> <Rle>
[1] chr1 100-148 +
[2] chr1 200-249 +
[3] chr1 300-349 +
-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths
expected output:
seqnames ranges strand | bin
<Rle> <IRanges> <Rle> | <integer>
[1] chr1 100-109 + | 1
[2] chr1 110-119 + | 2
[3] chr1 120-129 + | 3
[4] chr1 130-139 + | 4
[5] chr1 140-148 + | 5 ## cross gap
[6] chr1 200-200 + | 5
[7] chr1 201-210 + | 6
...
A diagram for this is:
(The middle
l3
is l2
, Its a typo)
I have a customize function to do this, but It runs very slowly. I'm looking forward to seeing if there are any efficient methods.
tile_granges_with_intron <- function(exons, n) {
exon_lengths <- width(exons)
total_length <- sum(exon_lengths)
bin_size <- total_length / n
bin_starts <- floor(seq(1, total_length, by = bin_size))
bin_ends <- c(bin_starts[-1] - 1, total_length)
cum_lengths <- cumsum(exon_lengths)
transcript_starts <- c(1, cum_lengths[-length(cum_lengths)] + 1)
transcript_ends <- cum_lengths
bins <- GRangesList()
for (i in 1:n) {
bin_start <- bin_starts[i]
bin_end <- bin_ends[i]
overlaps <- which(
bin_start <= transcript_ends & bin_end >= transcript_starts
)
bin_ranges <- GRanges()
for (j in overlaps) {
overlap_start <- max(bin_start, transcript_starts[j])
overlap_end <- min(bin_end, transcript_ends[j])
genomic_start <- start(exons[j]) + (overlap_start - transcript_starts[j])
genomic_end <- start(exons[j]) + (overlap_end - transcript_starts[j])
bin_ranges <- c(bin_ranges, GRanges(
seqnames = seqnames(exons[j]),
ranges = IRanges(start = genomic_start, end = genomic_end),
strand = strand(exons[j])
))
}
mcols(bin_ranges)$bin <- i
bins[[i]] <- bin_ranges
}
names(bins) <- paste0("bin", 1:n)
bins <- unlist(bins, use.names = FALSE)
return(bins)
}
If this cannot be accomplished in Grangs
, can it be accomplished in data.table
?
The column strands
here can be ignored because they are always consistent
# we can convert it to data.table
df <- as.data.frame(gr) %>%
data.table::as.data.table()
df
seqnames start end width strand
<fctr> <int> <int> <int> <fctr>
1: chr1 100 148 50 +
2: chr1 200 249 50 +
3: chr1 300 349 50 +
library(GenomicRanges)
library(GenomicFeatures)
gr <- GRanges(
seqnames = "chr1",
ranges = IRanges(start = c(100, 200, 300), end = c(148, 249, 349)),
strand = "+"
)
tile_granges_vectorized <- function(gr, n_bins) {
starts <- start(gr)
widths <- width(gr)
total_length <- sum(widths)
bin_size <- total_length / n_bins
cum_ends <- cumsum(widths)
cum_starts <- c(1, cum_ends[-length(cum_ends)] + 1)
max_results <- n_bins * length(gr)
result_seqnames <- character(max_results)
result_starts <- integer(max_results)
result_ends <- integer(max_results)
result_strands <- character(max_results)
result_bins <- integer(max_results)
result_count <- 0
for(i in 1:n_bins) {
bin_start_cum <- (i - 1) * bin_size + 1
bin_end_cum <- min(i * bin_size, total_length)
overlaps <- which(bin_start_cum <= cum_ends & bin_end_cum >= cum_starts)
if(length(overlaps) > 0) {
overlap_starts_cum <- pmax(bin_start_cum, cum_starts[overlaps])
overlap_ends_cum <- pmin(bin_end_cum, cum_ends[overlaps])
genomic_starts <- starts[overlaps] + (overlap_starts_cum - cum_starts[overlaps])
genomic_ends <- starts[overlaps] + (overlap_ends_cum - cum_starts[overlaps])
n_overlaps <- length(overlaps)
idx_range <- (result_count + 1):(result_count + n_overlaps)
result_seqnames[idx_range] <- as.character(seqnames(gr)[overlaps])
result_starts[idx_range] <- genomic_starts
result_ends[idx_range] <- genomic_ends
result_strands[idx_range] <- as.character(strand(gr)[overlaps])
result_bins[idx_range] <- i
result_count <- result_count + n_overlaps
}
}
if(result_count > 0) {
idx <- 1:result_count
GRanges(
seqnames = result_seqnames[idx],
ranges = IRanges(start = result_starts[idx], end = result_ends[idx]),
strand = result_strands[idx],
bin = result_bins[idx]
)
} else {
GRanges()
}
}
microbenchmark::microbenchmark(
intron = tile_granges_with_intron(gr, 10),
vectorized = tile_granges_vectorized(gr, 10),
times = 10,
check='identical'
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval cld
#> intron 322.7702 328.6691 341.78269 341.35505 345.6044 377.0622 10 a
#> vectorized 10.0457 10.4513 12.07787 10.97265 13.5882 18.1502 10 b
Created on 2025-06-30 with reprex v2.1.1