This is a subset of the dataframe I have:
structure(list(name = c("waldorf", "waldorf", "waldorf", "waldorf",
"waldorf", "waldorf", "waldorf", "waldorf", "waldorf", "waldorf",
"waldorf", "waldorf", "waldorf", "waldorf", "waldorf", "waldorf",
"waldorf", "waldorf", "waldorf", "waldorf", "waldorf", "waldorf",
"waldorf", "waldorf"), date = structure(c(1559347200, 1559347200,
1559347200, 1559347200, 1559347200, 1559347200, 1559347200, 1559347200,
1559347200, 1559347200, 1559347200, 1559347200, 1559347200, 1559347200,
1559347200, 1559347200, 1559347200, 1559347200, 1559347200, 1559347200,
1559347200, 1559347200, 1559347200, 1559347200), tzone = "UTC", class = c("POSIXct",
"POSIXt")), time = structure(c(0, 3600, 7200, 10800, 14400, 18000,
21600, 25200, 28800, 32400, 36000, 39600, 43200, 46800, 50400,
54000, 57600, 61200, 64800, 68400, 72000, 75600, 79200, 82800
), class = c("hms", "difftime"), units = "secs"), datetime = structure(c(1559347200,
1559350800, 1559354400, 1559358000, 1559361600, 1559365200, 1559368800,
1559372400, 1559376000, 1559379600, 1559383200, 1559386800, 1559390400,
1559394000, 1559397600, 1559401200, 1559404800, 1559408400, 1559412000,
1559415600, 1559419200, 1559422800, 1559426400, 1559430000), tzone = "UTC", class = c("POSIXct",
"POSIXt")), temp = c(72.9, 70.6, 69.7, 69.6, 68, 68.1, 67.9,
67.8, 69.6, 71.9, 75.6, 78.3, 80.1, 80.8, 82.8, 83.5, 83.3, 82.9,
81.5, 79.4, 77.6, 72.4, 70.6, 69.8)), row.names = c(NA, -24L), class = c("tbl_df",
"tbl", "data.frame"))
The expanded data frame has hourly temperature data for 5 years worth of days. How can I use the datetime or the other columns to calculate the average temperature from 1pm to 5pm for each day in the data frame? Can I do this in a dplyr string?
Here are two variations. The first filters the data to times from 1pm (13 in 24h time) to 4:59:59 and then summarizes to give one row per date.
library(dplyr); library(lubridate)
df |>
filter(hour(datetime) %in% 13:16) |>
summarize(avg_temp = mean(temp), .by = date)
# A tibble: 1 × 2
date avg_temp
<dttm> <dbl>
1 2019-06-01 00:00:00 82.6
Or we could could keep every row and add a new column that just references the times from 1-5pm for that date.
df |>
mutate(avg_temp = mean(temp[hour(datetime) %in% 13:16]), .by = date)
name date time datetime temp avg_temp
<chr> <dttm> <time> <dttm> <dbl> <dbl>
1 waldorf 2019-06-01 00:00:00 00:00 2019-06-01 00:00:00 72.9 82.6
2 waldorf 2019-06-01 00:00:00 01:00 2019-06-01 01:00:00 70.6 82.6
3 waldorf 2019-06-01 00:00:00 02:00 2019-06-01 02:00:00 69.7 82.6
4 waldorf 2019-06-01 00:00:00 03:00 2019-06-01 03:00:00 69.6 82.6
5 waldorf 2019-06-01 00:00:00 04:00 2019-06-01 04:00:00 68 82.6
6 waldorf 2019-06-01 00:00:00 05:00 2019-06-01 05:00:00 68.1 82.6
7 waldorf 2019-06-01 00:00:00 06:00 2019-06-01 06:00:00 67.9 82.6
8 waldorf 2019-06-01 00:00:00 07:00 2019-06-01 07:00:00 67.8 82.6
9 waldorf 2019-06-01 00:00:00 08:00 2019-06-01 08:00:00 69.6 82.6
10 waldorf 2019-06-01 00:00:00 09:00 2019-06-01 09:00:00 71.9 82.6
# ℹ 14 more rows