I have a dataframe like the following:
df <- tibble(
period = list(
c("09:34:00-20:40:00", "20:57:00-21:00:00"),
c("16:03:00-19:00:00", "19:10:00-21:00:00", "21:15-24:00"),
"7:02:00-13:20:00",
c("9:00:00-12:15:00", "14:30:00-16:30:00")
)
)
I want to create a new variable that contains the number of total hours (including decimal points) between the period 12:00 to 15:00, in any of the concatenated periods (or the single time, not concatenated. The output should look like the following:
df <- tibble(
period = list(
c("09:34:00-20:40:00", "20:57:00-21:00:00"),
c("16:03:00-19:00:00", "19:10:00-21:00:00", "21:15-24:00"),
"7:02:00-13:20:00",
c("9:00:00-12:15:00", "14:30:00-16:30:00")
),
hrs = list(3.0,0,1.33,0.75)
)
How can I create this new variable extracting the number of hours that overlap with the specified period?
library(dplyr)
library(tidyr)
library(anytime)
df %>%
mutate(id = row_number()) %>%
unnest(period) %>%
separate(period, into = c("p1", "p2"), sep = "-", remove = FALSE) %>%
mutate(across(c(p1, p2),
~anytime(paste("2020-01-01", .x),
tz = "UTC", asUTC = TRUE))) %>%
mutate(across(c(p1, p2),
~case_when(.x > as.POSIXct("2020-01-01 15:00", tz = "UTC") ~
as.POSIXct("2020-01-01 15:00", tz = "UTC"),
.x < as.POSIXct("2020-01-01 12:00", tz = "UTC") ~
as.POSIXct("2020-01-01 12:00", tz = "UTC"),
.default = .x))) %>%
mutate(hrs = difftime(p2, p1, units = "hours")) %>%
summarize(period = list(period),
hrs = sum(hrs),
.by = id) %>%
select(-id)
#> # A tibble: 4 × 2
#> period hrs
#> <list> <drtn>
#> 1 <chr [2]> 3.000000 hours
#> 2 <chr [3]> 0.000000 hours
#> 3 <chr [1]> 1.333333 hours
#> 4 <chr [2]> 0.750000 hours
Created on 2024-12-13 with reprex v2.0.2
If you want to do this for multiple periods and calculate the total sum of overlapping hours, then we can do it like below (I am borrowing Thomas's idea by using pmin
and pmax
to make this look cleaner):
total_hours <- \(p1, p2, s){
xt1 <- pmax(pmin(p1, s[2]), s[1])
xt2 <- pmax(pmin(p2, s[2]), s[1])
return(abs(difftime(xt2, xt1, units = "hours")))
}
s1 <- c(as.POSIXct("2020-01-01 12:00", tz = "UTC"),
as.POSIXct("2020-01-01 15:00", tz = "UTC"))
s2 <- c(as.POSIXct("2020-01-01 04:00", tz = "UTC"),
as.POSIXct("2020-01-01 06:00", tz = "UTC"))
s3 <- c(as.POSIXct("2020-01-01 17:00", tz = "UTC"),
as.POSIXct("2020-01-01 18:00", tz = "UTC"))
df %>%
mutate(id = row_number()) %>%
unnest(period) %>%
separate(period, into = c("p1", "p2"), sep = "-", remove = FALSE) %>%
mutate(across(c(p1, p2),
~anytime(paste("2020-01-01", .x),
tz = "UTC", asUTC = TRUE))) %>%
mutate(hrs = Reduce("+", lapply(list(s1, s2, s3),
\(s) total_hours(p1, p2, s)))) %>%
summarize(period = list(period),
hrs = sum(hrs),
.by = id) %>%
select(-id)
#> # A tibble: 4 × 2
#> period hrs
#> <list> <drtn>
#> 1 <chr [2]> 4.000000 hours
#> 2 <chr [3]> 1.000000 hours
#> 3 <chr [1]> 3.333333 hours
#> 4 <chr [2]> 2.750000 hours
Created on 2024-12-16 with reprex v2.0.2