rdplyrmedian

How to calculate median using lag/lead in R


I would like to calculate consecutive two-month medians of the chla variable in this dataset. I created an ID key (ID = current month, ID2 = the consecutive month) to help with the calculation. I tried using lead() and median(), but there is something not correct about how I am using the median function. If possible, I would prefer a dplyr approach but am open to any approach.

Note: There are no samples before May or after October.

Dataset:

df <- structure(list(Date = structure(c(1495166400, 1498536000, 1499659200, 
1503288000, 1504843200, 1507003200, 1526270400, 1528862400, 1531800000, 
1533528000, 1536552000, 1540180800, 1558324800, 1560139200, 1562040000, 
1565668800, 1568174400, 1570680000, 1588564800, 1592193600, NA, 
1596427200, 1599537600, 1602043200, 1621224000, 1624334400, 1626753600, 
1629086400, 1631592000, 1634702400, 1652673600, 1656302400, 1658721600, 
1661745600, 1663560000, 1666843200), tzone = "", class = c("POSIXct", 
"POSIXt")), Month = c(5, 6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 5, 
6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 5, 6, 7, 
8, 9, 10), Year = c(2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2019L, 2019L, 2019L, 
2019L, 2019L, 2019L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 
2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2022L, 2022L, 2022L, 
2022L, 2022L, 2022L), ID = c("5_2017", "6_2017", "7_2017", "8_2017", 
"9_2017", "10_2017", "5_2018", "6_2018", "7_2018", "8_2018", 
"9_2018", "10_2018", "5_2019", "6_2019", "7_2019", "8_2019", 
"9_2019", "10_2019", "5_2020", "6_2020", "7_2020", "8_2020", 
"9_2020", "10_2020", "5_2021", "6_2021", "7_2021", "8_2021", 
"9_2021", "10_2021", "5_2022", "6_2022", "7_2022", "8_2022", 
"9_2022", "10_2022"), ID2 = c("6_2017", "7_2017", "8_2017", "9_2017", 
"10_2017", "11_2017", "6_2018", "7_2018", "8_2018", "9_2018", 
"10_2018", "11_2018", "6_2019", "7_2019", "8_2019", "9_2019", 
"10_2019", "11_2019", "6_2020", "7_2020", "8_2020", "9_2020", 
"10_2020", "11_2020", "6_2021", "7_2021", "8_2021", "9_2021", 
"10_2021", "11_2021", "6_2022", "7_2022", "8_2022", "9_2022", 
"10_2022", "11_2022"), chla = c(34, 34, 34, 34, 92.5, 156, 34, 
34, 20, 34, 34, 34, 34, 34, 34, 34, 34, 176, 34, 34, NA, 34, 
34, 34, 34, 34, 34, 34, 34, 30, 34, 34, 34, 34, 34, 151)), out.attrs = list(
    dim = c(6L, 6L), dimnames = list(Var1 = c("Var1= 5", "Var1= 6", 
    "Var1= 7", "Var1= 8", "Var1= 9", "Var1=10"), Var2 = c("Var2=2017", 
    "Var2=2018", "Var2=2019", "Var2=2020", "Var2=2021", "Var2=2022"
    ))), class = "data.frame", row.names = c(NA, -36L))

My approach:

Trial <- df %>%
  mutate(Consec_2month_median = 
  case_when(ID2 == lead(ID) ~ median(chla, lead(chla), na.rm = TRUE)))

Desired output:

Desired <- structure(list(Date = structure(c(1495166400, 1498536000, 1499659200, 
1503288000, 1504843200, 1507003200, 1526270400, 1528862400, 1531800000, 
1533528000, 1536552000, 1540180800, 1558324800, 1560139200, 1562040000, 
1565668800, 1568174400, 1570680000, 1588564800, 1592193600, NA, 
1596427200, 1599537600, 1602043200, 1621224000, 1624334400, 1626753600, 
1629086400, 1631592000, 1634702400, 1652673600, 1656302400, 1658721600, 
1661745600, 1663560000, 1666843200), class = c("POSIXct", "POSIXt"
), tzone = ""), Month = c(5, 6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 
5, 6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 5, 6, 
7, 8, 9, 10), Year = c(2017, 2017, 2017, 2017, 2017, 2017, 2018, 
2018, 2018, 2018, 2018, 2018, 2019, 2019, 2019, 2019, 2019, 2019, 
2020, 2020, 2020, 2020, 2020, 2020, 2021, 2021, 2021, 2021, 2021, 
2021, 2022, 2022, 2022, 2022, 2022, 2022), ID = c("5_2017", "6_2017", 
"7_2017", "8_2017", "9_2017", "10_2017", "5_2018", "6_2018", 
"7_2018", "8_2018", "9_2018", "10_2018", "5_2019", "6_2019", 
"7_2019", "8_2019", "9_2019", "10_2019", "5_2020", "6_2020", 
"7_2020", "8_2020", "9_2020", "10_2020", "5_2021", "6_2021", 
"7_2021", "8_2021", "9_2021", "10_2021", "5_2022", "6_2022", 
"7_2022", "8_2022", "9_2022", "10_2022"), ID2 = c("6_2017", "7_2017", 
"8_2017", "9_2017", "10_2017", "11_2017", "6_2018", "7_2018", 
"8_2018", "9_2018", "10_2018", "11_2018", "6_2019", "7_2019", 
"8_2019", "9_2019", "10_2019", "11_2019", "6_2020", "7_2020", 
"8_2020", "9_2020", "10_2020", "11_2020", "6_2021", "7_2021", 
"8_2021", "9_2021", "10_2021", "11_2021", "6_2022", "7_2022", 
"8_2022", "9_2022", "10_2022", "11_2022"), chla = c(34, 34, 34, 
34, 92.5, 156, 34, 34, 20, 34, 34, 34, 34, 34, 34, 34, 34, 176, 
34, 34, NA, 34, 34, 34, 34, 34, 34, 34, 34, 30, 34, 34, 34, 34, 
34, 151), Consec_2month_median = c(34, 34, 34, 63.25, 124.25, 
NA, 34, 27, 27, 34, 34, NA, 34, 34, 34, 34, 105, NA, 34, NA, 
NA, 34, 34, NA, 34, 34, 34, 34, 32, NA, 34, 34, 34, 34, 92.5, 
NA)), row.names = c(NA, -36L), class = "data.frame")

I could live with the single value being reported as the median in instances where there is missing data or the consecutive month falls outside the time frame (e.g. the chla value for month 10 being used instead of an NA due to the absence of month 11 data)


Solution

  • library(dplyr)
    
    left_join(df, df[,c("ID", "chla")], 
                by = c("ID2" = "ID"), suffix = c("", "2")) %>% 
      rowwise() %>% 
      mutate(Consec_2month_median = median(c(chla, chla2))) %>% 
      select(-chla2) 
    
    #> # A tibble: 36 × 7
    #> # Rowwise: 
    #>    Date                Month  Year ID      ID2      chla Consec_2month_median
    #>    <dttm>              <dbl> <int> <chr>   <chr>   <dbl>                <dbl>
    #>  1 2017-05-19 00:00:00     5  2017 5_2017  6_2017   34                   34  
    #>  2 2017-06-27 00:00:00     6  2017 6_2017  7_2017   34                   34  
    #>  3 2017-07-10 00:00:00     7  2017 7_2017  8_2017   34                   34  
    #>  4 2017-08-21 00:00:00     8  2017 8_2017  9_2017   34                   63.2
    #>  5 2017-09-08 00:00:00     9  2017 9_2017  10_2017  92.5                124. 
    #>  6 2017-10-03 00:00:00    10  2017 10_2017 11_2017 156                   NA  
    #>  7 2018-05-14 00:00:00     5  2018 5_2018  6_2018   34                   34  
    #>  8 2018-06-13 00:00:00     6  2018 6_2018  7_2018   34                   27  
    #>  9 2018-07-17 00:00:00     7  2018 7_2018  8_2018   20                   27  
    #> 10 2018-08-06 00:00:00     8  2018 8_2018  9_2018   34                   34  
    #> # ℹ 26 more rows
    

    Created on 2024-10-16 with reprex v2.0.2