rdplyrcountrle

Count maximum consecutive repeated non-NA values grouped by another variable in dataframe R


I would like to determine the maximum count of consecutive repeated non-NA Valor values for each ADM2_PCODE. Therefore, the idea is to group by ADM2_PCODE, filter out the NA values, calculate for each Valor value the highest count of consecutive cases, and select the maximum occurrence between them.

Example dataframe below:

df <- structure(list(Year = c(1981, 1982, 1983, 1984, 1985, 1986, 
                              1981, 1982, 1983, 1984, 1985, 1986,
                              1981, 1982, 1983, 1984, 1985, 1986), ADM2_PCODE = c(1100015, 1100015, 1100015, 1100015, 1100015, 1100015, 
                                                                                  1100016, 1100016, 1100016, 1100016, 1100016, 1100016,
                                                                                  1100017, 1100017, 1100017, 1100017, 1100017, 1100017), 
                     Valor = c(NA, NA, 30, 30, NA, NA,
                               90, 10, 90, 10, 10, 10,
                               30, 20, 30, 40, 30, 60), geometry = c("MULTIPOLYGON (((-62.0495 -1...",
                                                                     "MULTIPOLYGON (((-62.0495 -1...", "MULTIPOLYGON (((-62.0495 -1...",
                                                                     "MULTIPOLYGON (((-62.0495 -1...", "MULTIPOLYGON (((-62.0495 -1...",
                                                                     "MULTIPOLYGON (((-62.0495 -1...", "MULTIPOLYGON (((-63.0495 -1...",
                                                                     "MULTIPOLYGON (((-62.0495 -1...", "MULTIPOLYGON (((-62.0495 -1...",
                                                                     "MULTIPOLYGON (((-62.0495 -1...", "MULTIPOLYGON (((-62.0495 -1...",
                                                                     "MULTIPOLYGON (((-62.0495 -1...", "MULTIPOLYGON (((-63.0495 -1...",
                                                                     "MULTIPOLYGON (((-63.0495 -1...", "MULTIPOLYGON (((-63.0495 -1...",
                                                                     "MULTIPOLYGON (((-63.0495 -1...", "MULTIPOLYGON (((-63.0495 -1...",
                                                                     "MULTIPOLYGON (((-63.0495 -1...")), row.names = c(NA, -18L), class = c("tbl_df", "tbl", "data.frame"))

Input:

 df
# A tibble: 18 x 4
    Year ADM2_PCODE Valor geometry                      
   <dbl>      <dbl> <dbl> <chr>                         
 1  1981    1100015    NA MULTIPOLYGON (((-62.0495 -1...
 2  1982    1100015    NA MULTIPOLYGON (((-62.0495 -1...
 3  1983    1100015    30 MULTIPOLYGON (((-62.0495 -1...
 4  1984    1100015    30 MULTIPOLYGON (((-62.0495 -1...
 5  1985    1100015    NA MULTIPOLYGON (((-62.0495 -1...
 6  1986    1100015    NA MULTIPOLYGON (((-62.0495 -1...
 7  1981    1100016    90 MULTIPOLYGON (((-63.0495 -1...
 8  1982    1100016    10 MULTIPOLYGON (((-62.0495 -1...
 9  1983    1100016    90 MULTIPOLYGON (((-62.0495 -1...
10  1984    1100016    10 MULTIPOLYGON (((-62.0495 -1...
11  1985    1100016    10 MULTIPOLYGON (((-62.0495 -1...
12  1986    1100016    10 MULTIPOLYGON (((-62.0495 -1...
13  1981    1100017    10 MULTIPOLYGON (((-63.0495 -1...
14  1982    1100017    20 MULTIPOLYGON (((-63.0495 -1...
15  1983    1100017    30 MULTIPOLYGON (((-63.0495 -1...
16  1984    1100017    40 MULTIPOLYGON (((-63.0495 -1...
17  1985    1100017    50 MULTIPOLYGON (((-63.0495 -1...
18  1986    1100017    60 MULTIPOLYGON (((-63.0495 -1...

Output expected:

ADM2_PCODE max_consecutive_values 
       <dbl>  <lgl>    
1    1100015 2 
2    1100016 3 
3    1100017 1 

Solution

  • Using data.table rleid to keep track of consecutive values you can do -

    library(dplyr)
    library(data.table)
    
    df %>%
      filter(!is.na(Valor)) %>%
      group_by(ADM2_PCODE) %>%
      mutate(grp = rleid(Valor)) %>%
      count(grp) %>%
      summarise(max_consecutive_values = max(n))
    
    #  ADM2_PCODE max_consecutive_values
    #       <dbl>                  <int>
    #1    1100015                      2
    #2    1100016                      3
    #3    1100017                      1