I would like to calculate consecutive two-month medians of the chla variable in this dataset. I created an ID key (ID = current month, ID2 = the consecutive month) to help with the calculation. I tried using lead()
and median()
, but there is something not correct about how I am using the median function. If possible, I would prefer a dplyr approach but am open to any approach.
Note: There are no samples before May or after October.
Dataset:
df <- structure(list(Date = structure(c(1495166400, 1498536000, 1499659200,
1503288000, 1504843200, 1507003200, 1526270400, 1528862400, 1531800000,
1533528000, 1536552000, 1540180800, 1558324800, 1560139200, 1562040000,
1565668800, 1568174400, 1570680000, 1588564800, 1592193600, NA,
1596427200, 1599537600, 1602043200, 1621224000, 1624334400, 1626753600,
1629086400, 1631592000, 1634702400, 1652673600, 1656302400, 1658721600,
1661745600, 1663560000, 1666843200), tzone = "", class = c("POSIXct",
"POSIXt")), Month = c(5, 6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 5,
6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 5, 6, 7,
8, 9, 10), Year = c(2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2019L, 2019L, 2019L,
2019L, 2019L, 2019L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L,
2021L, 2021L, 2021L, 2021L, 2021L, 2021L, 2022L, 2022L, 2022L,
2022L, 2022L, 2022L), ID = c("5_2017", "6_2017", "7_2017", "8_2017",
"9_2017", "10_2017", "5_2018", "6_2018", "7_2018", "8_2018",
"9_2018", "10_2018", "5_2019", "6_2019", "7_2019", "8_2019",
"9_2019", "10_2019", "5_2020", "6_2020", "7_2020", "8_2020",
"9_2020", "10_2020", "5_2021", "6_2021", "7_2021", "8_2021",
"9_2021", "10_2021", "5_2022", "6_2022", "7_2022", "8_2022",
"9_2022", "10_2022"), ID2 = c("6_2017", "7_2017", "8_2017", "9_2017",
"10_2017", "11_2017", "6_2018", "7_2018", "8_2018", "9_2018",
"10_2018", "11_2018", "6_2019", "7_2019", "8_2019", "9_2019",
"10_2019", "11_2019", "6_2020", "7_2020", "8_2020", "9_2020",
"10_2020", "11_2020", "6_2021", "7_2021", "8_2021", "9_2021",
"10_2021", "11_2021", "6_2022", "7_2022", "8_2022", "9_2022",
"10_2022", "11_2022"), chla = c(34, 34, 34, 34, 92.5, 156, 34,
34, 20, 34, 34, 34, 34, 34, 34, 34, 34, 176, 34, 34, NA, 34,
34, 34, 34, 34, 34, 34, 34, 30, 34, 34, 34, 34, 34, 151)), out.attrs = list(
dim = c(6L, 6L), dimnames = list(Var1 = c("Var1= 5", "Var1= 6",
"Var1= 7", "Var1= 8", "Var1= 9", "Var1=10"), Var2 = c("Var2=2017",
"Var2=2018", "Var2=2019", "Var2=2020", "Var2=2021", "Var2=2022"
))), class = "data.frame", row.names = c(NA, -36L))
My approach:
Trial <- df %>%
mutate(Consec_2month_median =
case_when(ID2 == lead(ID) ~ median(chla, lead(chla), na.rm = TRUE)))
Desired output:
Desired <- structure(list(Date = structure(c(1495166400, 1498536000, 1499659200,
1503288000, 1504843200, 1507003200, 1526270400, 1528862400, 1531800000,
1533528000, 1536552000, 1540180800, 1558324800, 1560139200, 1562040000,
1565668800, 1568174400, 1570680000, 1588564800, 1592193600, NA,
1596427200, 1599537600, 1602043200, 1621224000, 1624334400, 1626753600,
1629086400, 1631592000, 1634702400, 1652673600, 1656302400, 1658721600,
1661745600, 1663560000, 1666843200), class = c("POSIXct", "POSIXt"
), tzone = ""), Month = c(5, 6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10,
5, 6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 5, 6, 7, 8, 9, 10, 5, 6,
7, 8, 9, 10), Year = c(2017, 2017, 2017, 2017, 2017, 2017, 2018,
2018, 2018, 2018, 2018, 2018, 2019, 2019, 2019, 2019, 2019, 2019,
2020, 2020, 2020, 2020, 2020, 2020, 2021, 2021, 2021, 2021, 2021,
2021, 2022, 2022, 2022, 2022, 2022, 2022), ID = c("5_2017", "6_2017",
"7_2017", "8_2017", "9_2017", "10_2017", "5_2018", "6_2018",
"7_2018", "8_2018", "9_2018", "10_2018", "5_2019", "6_2019",
"7_2019", "8_2019", "9_2019", "10_2019", "5_2020", "6_2020",
"7_2020", "8_2020", "9_2020", "10_2020", "5_2021", "6_2021",
"7_2021", "8_2021", "9_2021", "10_2021", "5_2022", "6_2022",
"7_2022", "8_2022", "9_2022", "10_2022"), ID2 = c("6_2017", "7_2017",
"8_2017", "9_2017", "10_2017", "11_2017", "6_2018", "7_2018",
"8_2018", "9_2018", "10_2018", "11_2018", "6_2019", "7_2019",
"8_2019", "9_2019", "10_2019", "11_2019", "6_2020", "7_2020",
"8_2020", "9_2020", "10_2020", "11_2020", "6_2021", "7_2021",
"8_2021", "9_2021", "10_2021", "11_2021", "6_2022", "7_2022",
"8_2022", "9_2022", "10_2022", "11_2022"), chla = c(34, 34, 34,
34, 92.5, 156, 34, 34, 20, 34, 34, 34, 34, 34, 34, 34, 34, 176,
34, 34, NA, 34, 34, 34, 34, 34, 34, 34, 34, 30, 34, 34, 34, 34,
34, 151), Consec_2month_median = c(34, 34, 34, 63.25, 124.25,
NA, 34, 27, 27, 34, 34, NA, 34, 34, 34, 34, 105, NA, 34, NA,
NA, 34, 34, NA, 34, 34, 34, 34, 32, NA, 34, 34, 34, 34, 92.5,
NA)), row.names = c(NA, -36L), class = "data.frame")
I could live with the single value being reported as the median in instances where there is missing data or the consecutive month falls outside the time frame (e.g. the chla value for month 10 being used instead of an NA due to the absence of month 11 data)
library(dplyr)
left_join(df, df[,c("ID", "chla")],
by = c("ID2" = "ID"), suffix = c("", "2")) %>%
rowwise() %>%
mutate(Consec_2month_median = median(c(chla, chla2))) %>%
select(-chla2)
#> # A tibble: 36 × 7
#> # Rowwise:
#> Date Month Year ID ID2 chla Consec_2month_median
#> <dttm> <dbl> <int> <chr> <chr> <dbl> <dbl>
#> 1 2017-05-19 00:00:00 5 2017 5_2017 6_2017 34 34
#> 2 2017-06-27 00:00:00 6 2017 6_2017 7_2017 34 34
#> 3 2017-07-10 00:00:00 7 2017 7_2017 8_2017 34 34
#> 4 2017-08-21 00:00:00 8 2017 8_2017 9_2017 34 63.2
#> 5 2017-09-08 00:00:00 9 2017 9_2017 10_2017 92.5 124.
#> 6 2017-10-03 00:00:00 10 2017 10_2017 11_2017 156 NA
#> 7 2018-05-14 00:00:00 5 2018 5_2018 6_2018 34 34
#> 8 2018-06-13 00:00:00 6 2018 6_2018 7_2018 34 27
#> 9 2018-07-17 00:00:00 7 2018 7_2018 8_2018 20 27
#> 10 2018-08-06 00:00:00 8 2018 8_2018 9_2018 34 34
#> # ℹ 26 more rows
Created on 2024-10-16 with reprex v2.0.2