I am working with climate data that I have added dates to and subsetted down to just including the data that has the temperature above the 90th percentile for 3 or more days.
set.seed(12368)
A <- data.frame("Y" = rep(1990:2009, each = 360), "M" = rep(1:12, each = 30, times = 20), "D" = rep(1:30, 240), "Temp" = c(sample(0:35, size = 7200, replace = TRUE)))
B <- data.frame("Y" = rep(1990:2009, each = 360), "M" = rep(1:12, each = 30, times = 20), "D" = rep(1:30, 240), "Temp" = c(sample(-10:22, size = 7200, replace = TRUE)))
C <- data.frame("Y" = rep(1990:2009, each = 360), "M" = rep(1:12, each = 30, times = 20), "D" = rep(1:30, 240), "Temp" = c(sample(3:42, size = 7200, replace = TRUE)))
climate <- list("Alist" = A, "Blist" = B, "Clist" = C)
#climate
library(dplyr)
library(magrittr)
datedclimate <- lapply(lapply(climate,
function(x)
x %<>%
mutate("Date" = as.Date(with(x, paste(Y,M,D, sep="-")),"%Y-%m-%d"))
),
function(y)
y %>% relocate("Date")
)
datedclimate
tm <- lapply(datedclimate, \(x) {
y <- as.data.frame(subset(x,
Temp > quantile(Temp, probs = 0.90, na.rm = TRUE))
)
y[unique(
sort(
unlist(
lapply( # this iterates through all of my data sets
which(
c(diff(y[,'Date'])==1, FALSE) & c(diff(y[,'Date'], diff=2)==0, FALSE, FALSE)),
\(x) x + 0:2)
))),]
}
)
tm
The next step I am looking to do is then find the average number of days for each consecutive amount of days. For example, the subsetted C list has 5 stretches of 3 consecutive days and 1 stretch of 4 consecutive days. Therefore, the average number of consecutive days would be 3.17 from (3+3+3+3+3+4)/6.
I tried to modify the code from this question, but it does not return the right numbers. I would also prefer to modify/work with the subsetted data, tm, from above if possible.
ConsecMean <- function(x) {
x <- ifelse(x > quantile(x, probs = 0.90, na.rm = TRUE), 0, 1)
cs <- cumsum(x)
cs <- cs[x == 0]
mean <- mean(table(cs))
return(mean)
}
tri <- lapply(lapply(datedclimate, "[[", 5), ConsecMean)
tri
> tri
$Alist
[1] 1.105735
$Blist
[1] 1.104746
$Clist
[1] 1.099693
So, I'm looking for a function that I can apply to the list of data frames (climate) that will return this average number for each data frame in the list. I'm thinking I should use lapply, but I don't know where to go from there. I also tried to use rle, but it is not compatible with the Date format and I also think that rle is not the right function to use for this lol. What code/functions would you recommend using to solve this problem?
Using tm
in the Note at the end seqid
assigns a unique id which is the same for all Dates in any particular consecutive sequence of dates, table
counts the elements in each such group giving a vector of counts and mean
takes their average.
library(collapse)
sapply(tm, function(x) mean(table(seqid(x$Date))))
## Alist Blist Clist
## 3.000000 3.000000 3.166667
tm <- list(Alist = structure(list(Date = structure(c(9344, 9345, 9346,
10998, 10999, 11000, 11007, 11008, 11009, 11428, 11429, 11430,
11952, 11953, 11954, 13415, 13416, 13417), class = "Date"), Y = c(1995L,
1995L, 1995L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2001L,
2001L, 2001L, 2002L, 2002L, 2002L, 2006L, 2006L, 2006L), M = c(8L,
8L, 8L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 9L, 9L, 9L, 9L, 9L,
9L), D = c(2L, 3L, 4L, 11L, 12L, 13L, 20L, 21L, 22L, 16L, 17L,
18L, 22L, 23L, 24L, 24L, 25L, 26L), Temp = c(33L, 34L, 33L, 34L,
33L, 33L, 34L, 33L, 35L, 34L, 33L, 33L, 35L, 35L, 33L, 34L, 35L,
35L)), row.names = c(2012L, 2013L, 2014L, 3641L, 3642L, 3643L,
3650L, 3651L, 3652L, 4066L, 4067L, 4068L, 4582L, 4583L, 4584L,
6024L, 6025L, 6026L), class = "data.frame"), Blist = structure(list(
Date = structure(c(8041, 8042, 8043, 9820, 9821, 9822, 10759,
10760, 10761, 12383, 12384, 12385, 13116, 13117, 13118), class = "Date"),
Y = c(1992L, 1992L, 1992L, 1996L, 1996L, 1996L, 1999L, 1999L,
1999L, 2003L, 2003L, 2003L, 2005L, 2005L, 2005L), M = c(1L,
1L, 1L, 11L, 11L, 11L, 6L, 6L, 6L, 11L, 11L, 11L, 11L, 11L,
12L), D = c(7L, 8L, 9L, 20L, 21L, 22L, 17L, 18L, 19L, 27L,
28L, 29L, 29L, 30L, 1L), Temp = c(20L, 20L, 22L, 22L, 20L,
20L, 20L, 20L, 21L, 20L, 20L, 20L, 20L, 21L, 21L)), row.names = c(727L,
728L, 729L, 2480L, 2481L, 2482L, 3407L, 3408L, 3409L, 5007L,
5008L, 5009L, 5729L, 5730L, 5731L), class = "data.frame"), Clist = structure(list(
Date = structure(c(7504, 7505, 7506, 7616, 7617, 7618, 10090,
10091, 10092, 10635, 10636, 10637, 10700, 10701, 10702, 13591,
13592, 13593, 13594), class = "Date"), Y = c(1990L, 1990L,
1990L, 1990L, 1990L, 1990L, 1997L, 1997L, 1997L, 1999L, 1999L,
1999L, 1999L, 1999L, 1999L, 2007L, 2007L, 2007L, 2007L),
M = c(7L, 7L, 7L, 11L, 11L, 11L, 8L, 8L, 8L, 2L, 2L, 2L,
4L, 4L, 4L, 3L, 3L, 3L, 3L), D = c(19L, 20L, 21L, 8L, 9L,
10L, 17L, 18L, 19L, 13L, 14L, 15L, 19L, 20L, 21L, 19L, 20L,
21L, 22L), Temp = c(39L, 39L, 40L, 39L, 40L, 41L, 39L, 39L,
42L, 41L, 41L, 42L, 40L, 40L, 42L, 42L, 42L, 40L, 41L)), row.names = c(199L,
200L, 201L, 308L, 309L, 310L, 2747L, 2748L, 2749L, 3283L, 3284L,
3285L, 3349L, 3350L, 3351L, 6199L, 6200L, 6201L, 6202L), class = "data.frame"))