rtime-seriesshuffleoverlap

How can I shuffle time blocks in time series without overlap in R?


Let's assume there is a group of 3 persons, for which I have a time series of when they start and finish an activity. An example dataframe would be:

library(tidyverse)
GrXX <- tibble(Individual = rep(c("A", "B", "C"), times = c(2, 3, 5)),
               Frame_beginning = c(1, 16, 7, 21, 29, 3, 9, 12, 19, 27),
               Frame_end = c(3, 22, 15, 24, 30, 7, 10, 12, 24, 30),
               Duration = Frame_end - Frame_beginning + 1,
               Frame_end_valid_group = 30)

This group can be represented by the following figure:enter image description here

I would love to shuffle this group around, so that the time blocks within individuals are randomly placed, without overlap within individuals (there can be overlap between individuals). Additonally, all original time blocks should also be fully on the shuffled timeline (i.e., they should not start or finish outside of the original timeline). One potential shuffling could be the following: enter image description here

My question is how to do that in R, in an efficient way. I started some code, but it seems very complex for the task, and does not prevent all overlaps. Here it is:

GrXX_shuffled <- tibble()

for (i in c("A", "B", "C")) {
  temp <- GrXX %>%
    filter(Individual == i) %>%
    arrange(-Duration, Frame_beginning)
  
  valid_frames <- c()
  Frame_shuffled_start <- c()
  Frame_shuffled_end <- c()
  Frame_earliest <- 1
  
  for (j in 1:nrow(temp)) {
    temp2 <- temp[j, ]
    
    if(j == 1) {
    valid_frames <- 1:(temp2$Frame_end_valid_group - temp2$Duration + 1)
    
    temp2 <- temp2 %>%
      mutate(Frame_beginning_shuffled = sample(valid_frames, size = 1), .after = Frame_beginning) %>%
      mutate(Frame_end_shuffled = Frame_beginning_shuffled + Duration - 1, .after = Frame_end)
    
    GrXX_shuffled <- bind_rows(GrXX_shuffled, temp2)
    }

    if(j != 1) {
      valid_frames <- c(Frame_earliest:(Frame_shuffled_start[j-1] - temp2$Duration - 1),
                        (Frame_shuffled_end[j-1] + 2):(temp2$Frame_end_valid_group - temp2$Duration + 1))
      
      valid_frames <- valid_frames[valid_frames >= 1 & valid_frames <= 30]
      
      temp2 <- temp2 %>%
        mutate(Frame_beginning_shuffled = sample(valid_frames, size = 1), .after = Frame_beginning) %>%
        mutate(Frame_end_shuffled = Frame_beginning_shuffled + Duration - 1, .after = Frame_end)
      
      GrXX_shuffled <- bind_rows(GrXX_shuffled, temp2)
    }
    
    Frame_shuffled_start <- sort(c(Frame_shuffled_start, temp2$Frame_beginning_shuffled))
    Frame_shuffled_end <- sort(c(Frame_shuffled_end, temp2$Frame_end_shuffled))
    Frame_earliest <- min(valid_frames_2)
  }
}

The idea behind the code is to:

Any idea how to solve this efficiently?

Many thanks in advance :-)


Solution

  • Here's an efficient function to do what was described. It assumes discrete timesteps as in the OP's example.

    library(data.table)
    library(RcppAlgos) # for `compositionsSample`
    library(Rfast) # for `colShuffle` and `colCumSums`
    
    shuffle <- function(dt, n = 1L) {
      f <- function(v, m) {
        k <- length(v)
        k1 <- k + 1L
        as.data.table(
          cbind(
            rep = rep(1:n, each = k),
            matrix(
              colCumSums(
                `dim<-`(
                  rbind(
                    `dim<-`(t(compositionsSample(0:(m - sum(v)), k + 1, TRUE,
                                                 n = n)[,-k - 1]), NULL),
                    `dim<-`(colShuffle(matrix(v, k, n)), NULL)
                  ), c(2*k, n)
                )
              ) + 1:0,
              n*k, 2, TRUE, list(NULL, c("Frame_beginning", "Frame_end"))
            )
          )
        )[,`:=`(Duration = Frame_end - Frame_beginning + 1,
                Frame_end_valid_group = m)]
      }
      setorder(
        setcolorder(dt[,f(Duration, Frame_end_valid_group[1]), Individual], "rep"),
        rep, Individual
      )
    }
    

    Demonstrating:

    shuffle(setDT(GrXX))[]
    #>       rep Individual Frame_beginning Frame_end Duration Frame_end_valid_group
    #>     <num>     <char>           <num>     <num>    <num>                 <num>
    #>  1:     1          A               3         9        7                    30
    #>  2:     1          A              27        29        3                    30
    #>  3:     1          B               2         3        2                    30
    #>  4:     1          B               5         8        4                    30
    #>  5:     1          B              21        29        9                    30
    #>  6:     1          C               1         5        5                    30
    #>  7:     1          C               8         8        1                    30
    #>  8:     1          C              12        13        2                    30
    #>  9:     1          C              17        22        6                    30
    #> 10:     1          C              26        29        4                    30
    

    It can also do multiple shuffles using the n argument:

    shuffle(GrXX, 3)[]
    #>       rep Individual Frame_beginning Frame_end Duration Frame_end_valid_group
    #>     <num>     <char>           <num>     <num>    <num>                 <num>
    #>  1:     1          A              17        23        7                    30
    #>  2:     1          A              25        27        3                    30
    #>  3:     1          B               5         8        4                    30
    #>  4:     1          B              16        17        2                    30
    #>  5:     1          B              19        27        9                    30
    #>  6:     1          C               4         4        1                    30
    #>  7:     1          C               6         9        4                    30
    #>  8:     1          C              11        12        2                    30
    #>  9:     1          C              15        20        6                    30
    #> 10:     1          C              25        29        5                    30
    #> 11:     2          A               8        10        3                    30
    #> 12:     2          A              19        25        7                    30
    #> 13:     2          B               5         6        2                    30
    #> 14:     2          B               8        16        9                    30
    #> 15:     2          B              20        23        4                    30
    #> 16:     2          C               3         7        5                    30
    #> 17:     2          C              11        12        2                    30
    #> 18:     2          C              14        17        4                    30
    #> 19:     2          C              19        19        1                    30
    #> 20:     2          C              21        26        6                    30
    #> 21:     3          A              12        14        3                    30
    #> 22:     3          A              23        29        7                    30
    #> 23:     3          B               4         5        2                    30
    #> 24:     3          B              10        13        4                    30
    #> 25:     3          B              15        23        9                    30
    #> 26:     3          C               2         7        6                    30
    #> 27:     3          C               9        10        2                    30
    #> 28:     3          C              12        16        5                    30
    #> 29:     3          C              19        19        1                    30
    #> 30:     3          C              23        26        4                    30
    

    And it's fast:

    microbenchmark::microbenchmark(shuffle(GrXX, 100))
    #> Unit: milliseconds
    #>                expr    min     lq     mean  median      uq    max neval
    #>  shuffle(GrXX, 100) 1.9616 2.1486 2.343154 2.22235 2.36755 5.8275   100