rdplyrgenomicranges

R - remove specific numeric ranges from a dataframe based on second dataframe and reshape everything


I have a dataframe ('reference') containing the genomic locations ('start' and 'end' columns) for 3 human chromosomes ('seqnames' column). I need to remove specific ranges from each chromosome based on a second dataframe ('blacklist'), and reshape the original dataframe to exclude these ranges from each chromosome. I tried to read these dataframe as a GRanges object (GenomicRanges library), but I didn't found any function to do this specific operation, so maybe it will be easier do this as a data.frame ?

    reference <- data.frame(seqnames = c('chr1','chr2','chr3'),
                            start = c(1,1,1),
                            end = c(248956422, 242193529, 198295559)
                           )

reference 

seqnames    start   end
<chr>   <dbl>   <dbl>
chr1    1   248956422
chr2    1   242193529
chr3    1   198295559

blacklist <- data.frame(seqnames = c('chr1','chr1','chr1','chr2','chr2','chr3','chr3'),
                        start = c(628903, 5850087, 8909610, 10, 23123, 163123, 3163123),
                        end = c(635104, 5850571, 8910014, 9312, 27120, 193120, 3963122)
                        )

blacklist 

seqnames    start   end
<chr>   <dbl>   <dbl>
chr1    628903  635104
chr1    5850087 5850571
chr1    8909610 8910014
chr2    10  9312
chr2    23123   27120
chr3    163123  193120
chr3    3163123 3963122

Desired output (new dataframe 'reference_new'):

reference_new <- data.frame(seqnames = c('chr1','chr1','chr1','chr1','chr2', 'chr2', 'chr2', 'chr3', 'chr3', 'chr3' ),
                            start = c(1, 635105, 5850572,8910015, 1, 9313,27121, 1,193121,3963123 ),
                            end = c(628902, 5850086,8909609, 248956422, 9, 23122,242193529, 163122,3163122, 198295559)
                           )           
reference_new

seqnames    start   end
<chr>   <dbl>   <dbl>
chr1    1   628902
chr1    635105  5850086
chr1    5850572 8909609
chr1    8910015 248956422
chr2    1   9
chr2    9313    23122
chr2    27121   242193529
chr3    1   163122
chr3    193121  3163122
chr3    3963123 198295559

Solution

  • This feels klunky but works for the example data. I didn't do any checking for blacklist values outside the reference range, so I suspect that would cause problems and might merit more checks / different approach.

    library(tidyverse)
    conv_shape <- function(df, rev = 1) {
      df%>%
        pivot_longer(-seqnames) %>%
        mutate(value = value + case_when(rev == -1 & name == "start" ~ -1,
                                         rev == -1 & name == "end" ~ 1,
                                         TRUE ~ 0),
               active = if_else(name == "start", 1, -1) * rev)
    }
    
    bind_rows(.id = "src",
      ref = reference %>% conv_shape,
      blacklist = blacklist %>% conv_shape(-1)
    ) %>%
      arrange(seqnames, value) %>%
      group_by(seqnames) %>%
      mutate(inrng = cumsum(active),
             status = if_else(inrng == 1, "start", "end"),
             grp = cumsum(inrng - lag(inrng, default = 0) == 1)) %>%
      ungroup() %>%
      select(seqnames, value, status, grp) %>%
      pivot_wider(names_from = status, values_from = value)
    

    Result

    # A tibble: 10 × 4
       seqnames   grp   start       end
       <chr>    <int>   <dbl>     <dbl>
     1 chr1         1       1    628902
     2 chr1         2  635105   5850086
     3 chr1         3 5850572   8909609
     4 chr1         4 8910015 248956422
     5 chr2         1       1         9
     6 chr2         2    9313     23122
     7 chr2         3   27121 242193529
     8 chr3         1       1    163122
     9 chr3         2  193121   3163122
    10 chr3         3 3963123 198295559