rdataframetidyrplyr

automatizing ranges with R


I am trying to automatize the calculation of some ranges that have to be integrated later on. I am doing it without using tidyR, but I believe that applying tidy could help solve the last bottle neck of my problem My initial table looks like this:

int_NMR <- data.frame("Component" = c("A", "B", "C", "D", "E",
                                      "F", "G", "H"),
                 "From" = c(0.0, 45.0, 60.0, 95.0, 110.0, 145.0, 165.0, 190), 
                  "To" = c(45.0, 60.0, 95.0, 110.0, 145.0, 165.0, 190.0, 215.0))

to the "From" and "To" values I have to add or substract an X value (for example 160). However, this implies that some ranges will overlap and then need to be sliced and the column "component" be kept for both slices.

The end result would look something like this:

enter image description here

As you can see some have been sliced and there is an overlap that needs to be mantained, for instance the component A, which initially goes from 0 to 45, now is going from 0 to 5, from 5 to 30 and from 30 to 45, and overlaping with components F_low, G_low and H_low.

The approach that I have followed is to bind tables with NAs on them but when I arrive to the last table, the overlaping ranges can not be sliced without me loosing the information of which component it belongs.

Below my code

na_frame <- NULL
int_NMR_all<- NULL

int_NMR_high <- data.frame(setNames(lapply(int_NMR[1], function(x) paste("ssb_high", x, sep="_")),"Component_ssb"),int_NMR[2:3]+sb_ofset)

int_NMR_low <- data.frame(setNames(lapply(int_NMR[1], function(x) paste("ssb_low", x, sep="_")),"Component_ssb"),int_NMR[2:3]-sb_ofset)

int_NMR_all <- rbind(int_NMR_low,int_NMR_high)

na_frame <- as.data.frame(matrix(NA, nrow = nrow(int_NMR), ncol = 3))
names(na_frame) <- names(int_NMR_all)
int_NMR_all <- rbind(int_NMR_all, na_frame)

na_frame <- as.data.frame(matrix(NA, nrow = nrow(int_NMR_all), ncol = 1))
names(na_frame) <- c("Component")
int_NMR_all<- cbind(int_NMR_all, na_frame)

na_frame <- as.data.frame(matrix(NA, nrow = nrow(int_NMR), ncol = 1))
names(na_frame) <- c("Component_ssb")
int_NMR<- cbind(int_NMR, na_frame)

na_frame <- as.data.frame(matrix(NA, nrow = 2*nrow(int_NMR), ncol = 4))

names(na_frame) <- names(int_NMR_all)

int_NMR <- rbind(int_NMR, na_frame)

int_NMR_all <- rbind(int_NMR,int_NMR_all)

int_NMR_all <- int_NMR_all[!is.na(int_NMR_all$From),] 

Solution

  • Here is a data.table approach, also using the intervalSurgeon-package for finding unique intervals.

    Probably not thje most efficient way, but output looks as desired..

    library(data.table)
    library(IntervalSurgeon)
    value_n <- 160
    # comvert to data.table format
    setDT(int_NMR)
    # create high and low intervals
    int_NMR_high <- copy(int_NMR)[, `:=`(From = From + value_n, To = To + value_n, Component_mod = paste0(Component, "_High"))]
    int_NMR_low <- copy(int_NMR)[, `:=`(From = From - value_n, To = To - value_n, Component_mod = paste0(Component, "_Low"))]
    # create one data.table of all intervals
    all_int <- rbindlist(list(int_NMR_low, int_NMR, int_NMR_high), fill = TRUE)
    # create a new data.table with all the non-overlapping intervals
    final <- data.table(sections(breaks(as.matrix(all_int[, 2:3]))))
    setnames(final, c("From", "To"))
    # perform overlap joins
    final[all_int[is.na(Component_mod), ], Component := i.Component, on = .(From < To, To > From )]
    final[all_int[!is.na(Component_mod), ], Component_mod := i.Component_mod, on = .(From < To, To > From )]
    

    final

    #     From   To Component Component_mod
    #  1: -160 -115      <NA>         A_Low
    #  2: -115 -100      <NA>         B_Low
    #  3: -100  -65      <NA>         C_Low
    #  4:  -65  -50      <NA>         D_Low
    #  5:  -50  -15      <NA>         E_Low
    #  6:  -15    0      <NA>         F_Low
    #  7:    0    5         A         F_Low
    #  8:    5   30         A         G_Low
    #  9:   30   45         A         H_Low
    # 10:   45   55         B         H_Low
    # 11:   55   60         B          <NA>
    # 12:   60   95         C          <NA>
    # 13:   95  110         D          <NA>
    # 14:  110  145         E          <NA>
    # 15:  145  160         F          <NA>
    # 16:  160  165         F        A_High
    # 17:  165  190         G        A_High
    # 18:  190  205         H        A_High
    # 19:  205  215         H        B_High
    # 20:  215  220      <NA>        B_High
    # 21:  220  255      <NA>        C_High
    # 22:  255  270      <NA>        D_High
    # 23:  270  305      <NA>        E_High
    # 24:  305  325      <NA>        F_High
    # 25:  325  350      <NA>        G_High
    # 26:  350  375      <NA>        H_High
    #     From   To Component Component_mod