rlongitudinal

Finding observations occurring more than 3 times in a row in longitudinal data


I have the following data:

structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L), A = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 
0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 
1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L), Day = c(1L, 2L, 3L, 4L, 5L, 
6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 1L, 2L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 
17L)), row.names = c(NA, 48L), class = "data.frame")

I want to find IDs that have A equal to 1 on more than 3 consecutive days, and create the following data:

structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L), A = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 
0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 
1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L), Day = c(1L, 2L, 3L, 4L, 5L, 
6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 1L, 2L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 
17L), Censor = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 1L, 1L, 1L, 1L), Day_2 = c(NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L)), row.names = c(NA, 
48L), class = "data.frame")

Here, Censor is 1 starting from the first Day on which an ID begins a run of more than 3 consecutive observations with A equal to 1, and Day_2 is the Day on which that run starts (NA for IDs that have no such run).


Solution

  • Wrap an rle approach in a function that identifies runs of consecutive 1s with length > 3, and then apply it per ID with ave.

    # Flag every element that belongs to a run of consecutive 1s longer than n.
    #
    # x: vector of 0/1 indicators, in observation order.
    # n: run-length threshold; a run is flagged only when its length is
    #    strictly greater than n. Defaults to 3, i.e. runs of four or more.
    #
    # Returns a numeric vector the same length as x: 1 for elements inside a
    # qualifying run, 0 everywhere else.
    f <- \(x, n = 3) {
      runs <- rle(x)
      # which() keeps only TRUE positions, so runs whose comparison is NA
      # (e.g. missing values in x) are never flagged.
      hit <- which(runs$values == 1 & runs$lengths > n)
      # One 0/1 flag per run, then expand each flag back to the run's length.
      rep.int(replace(numeric(length(runs$values)), hit, 1), runs$lengths)
    }
    
    # Add the two requested columns, computed separately for each ID.
    #   Censor: 1 on every row that f flags as part of a long run of
    #           consecutive A == 1 rows, 0 elsewhere. In the sample data each
    #           flagged run extends to the ID's last row.
    #   Day_2 : the Day of the ID's first flagged row, repeated on all of that
    #           ID's rows; NA when the ID has no flagged row.
    res <- within(dat, {
      Censor <- as.integer(ave(A, ID, FUN = f))
      # Look up Day itself rather than the row position inside the ID, so the
      # result stays right when Day does not run 1, 2, 3, ... within an ID.
      # Taking [1L] of an empty selection yields NA for IDs never flagged.
      Day_2 <- ave(seq_along(Day), ID,
                   FUN = \(i) Day[i][Censor[i] == 1L][1L])
    })
    res
    #    ID A Day Day_2 Censor
    # 1   1 0   1    NA      0
    # 2   1 0   2    NA      0
    # 3   1 0   3    NA      0
    # 4   1 0   4    NA      0
    # 5   1 0   5    NA      0
    # 6   1 1   6    NA      0
    # 7   1 1   7    NA      0
    # 8   1 0   8    NA      0
    # 9   1 0   9    NA      0
    # 10  1 0  10    NA      0
    # 11  1 0  11    NA      0
    # 12  1 1  12    NA      0
    # 13  1 1  13    NA      0
    # 14  1 1  14    NA      0
    # 15  1 0  15    NA      0
    # 16  1 0  16    NA      0
    # 17  1 0  17    NA      0
    # 18  2 0   1     5      0
    # 19  2 0   2     5      0
    # 20  2 0   3     5      0
    # 21  2 0   4     5      0
    # 22  2 1   5     5      1
    # 23  2 1   6     5      1
    # 24  2 1   7     5      1
    # 25  2 1   8     5      1
    # 26  2 1   9     5      1
    # 27  2 1  10     5      1
    # 28  2 1  11     5      1
    # 29  2 1  12     5      1
    # 30  2 1  13     5      1
    # 31  2 1  14     5      1
    # 32  3 0   1    14      0
    # 33  3 0   2    14      0
    # 34  3 0   3    14      0
    # 35  3 0   4    14      0
    # 36  3 0   5    14      0
    # 37  3 1   6    14      0
    # 38  3 1   7    14      0
    # 39  3 1   8    14      0
    # 40  3 0   9    14      0
    # 41  3 1  10    14      0
    # 42  3 0  11    14      0
    # 43  3 0  12    14      0
    # 44  3 0  13    14      0
    # 45  3 1  14    14      1
    # 46  3 1  15    14      1
    # 47  3 1  16    14      1
    # 48  3 1  17    14      1
    

    Data:

    # Example data: 48 daily observations for three IDs (17, 14 and 17 rows),
    # with a 0/1 indicator A and a per-ID day counter that starts at 1.
    dat <- structure(
      list(
        ID = rep(1:3, times = c(17L, 14L, 17L)),
        A = c(
          # ID 1
          0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L,
          # ID 2
          0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
          # ID 3
          0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L
        ),
        Day = c(1:17, 1:14, 1:17)
      ),
      row.names = c(NA, 48L),
      class = "data.frame"
    )