I am having an awful lot of trouble trying to get summarized rows from a data frame that has rows with duplicated timestamps, but has different data in each row, so that some variables are NA, and others are valid data. The goal is to get one row per timestamp, with all valid columns filled in. Also, some of the duplicated timestamps have data in both rows, so both rows need to be returned in this case. Here is an example input data:
input_data <- structure(list(File_Category = c("OneMinute", "OneMinuteExtra",
"OneMinute", "OneMinuteExtra", "OneMinute", "OneMinuteExtra",
"OneMinute", "OneMinuteExtra", "OneMinute", "OneMinuteExtra",
"OneMinute", "OneMinuteExtra"), File_Station_Name = c("Lincoln 1500 N 45th",
"Lincoln 1500 N 45th", "Lincoln 1500 N 45th", "Lincoln 1500 N 45th",
"Lincoln 1500 N 45th", "Lincoln 1500 N 45th", "Lincoln 1500 N 45th",
"Lincoln 1500 N 45th", "Lincoln 1500 N 45th", "Lincoln 1500 N 45th",
"Lincoln 1500 N 45th", "Lincoln 1500 N 45th"), TIMESTAMP = c("2019-09-05 00:57:00",
"2019-09-05 00:57:00", "2019-09-05 00:58:00", "2019-09-05 00:58:00",
"2019-09-05 00:59:00", "2019-09-05 00:59:00", "2019-09-05 01:00:00",
"2019-09-05 01:00:00", "2019-09-05 01:01:00", "2019-09-05 01:01:00",
"2019-09-05 01:02:00", "2019-09-05 01:02:00"), RECORD = c(423L,
423L, 424L, 424L, 425L, 425L, 426L, 426L, 427L, 427L, 428L, 428L
), Ta_2m_Avg = c(23.05, 22.93, 23.05, NA, 22.99, NA, 22.97, NA,
22.97, NA, 22.93, NA), TaMax_2m = c(23.2, 23.01, 23.2, NA, 23.1,
NA, 23.11, NA, 23.11, NA, 23.01, NA), TaMin_2m = c(22.96, 22.83,
22.96, NA, 22.92, NA, 22.92, NA, 22.92, NA, 22.83, NA), RH_2m_Avg = c(64.07,
65, 64.07, NA, 63.84, NA, 64.07, NA, 64.5, NA, 65, NA), RHMax_2m = c(64.41,
65.28, 64.41, NA, 63.91, NA, 64.25, NA, 65.11, NA, 65.28, NA),
RHMin_2m = c(63.85, 64.71, 63.85, NA, 63.75, NA, 63.82, NA,
64.11, NA, 64.71, NA), Dp_2m_Avg = c(15.89, 16.01, 15.89,
NA, 15.78, NA, 15.82, NA, 15.93, NA, 16.01, NA), DpMax_2m = c(15.99,
16.14, 15.99, NA, 15.91, NA, 15.98, NA, 16.08, NA, 16.14,
NA), DpMin_2m = c(15.78, 15.88, 15.78, NA, 15.69, NA, 15.71,
NA, 15.78, NA, 15.88, NA), HeatIndex_2m_Avg = c(23.06, 22.96,
23.06, NA, 23, NA, 22.98, NA, 22.99, NA, 22.96, NA), HeatIndexMax_2m = c(23.22,
23.04, 23.22, NA, 23.12, NA, 23.13, NA, 23.13, NA, 23.04,
NA), WindChill_2m_Avg = c(24.85, 26.72, 24.85, NA, 24.67,
NA, 24.88, NA, 25.9, NA, 26.72, NA), WindChillMin_2m = c(24.6,
25.01, 24.6, NA, 24.42, NA, 24.75, NA, 24.6, NA, 25.01, NA
), WndAveSpd_3m = c(0.752, 0.044, 0.752, NA, 0.969, NA, 0.564,
NA, 0.419, NA, 0.044, NA), WndVecMagAve_3m = c(0.715, 0.044,
0.715, NA, 0.959, NA, 0.552, NA, 0.418, NA, 0.044, NA), WndAveDir_3m = c(147.7,
120.3, 147.7, NA, 140.2, NA, 128.1, NA, 140.4, NA, 120.3,
NA), WndAveDirSD_3m = c(18.08, 0.176, 18.08, NA, 8.27, NA,
11.74, NA, 3.204, NA, 0.176, NA), WndMaxSpd5s_3m = c(1.1,
0.3, 1.1, NA, 1.366, NA, 0.7, NA, 1, NA, 0.3, NA), WndMax_5sec_Dir_3m = c(157.8,
120.2, 157.8, NA, 126.9, NA, 139.5, NA, 142.7, NA, 120.2,
NA), PresAvg_1pnt5m = c(977.1187, 977.1306, 977.1187, NA,
977.1209, NA, 977.127, NA, 977.1389, NA, 977.1306, NA), PresMax_1pnt5m = c(977.1798,
977.1832, 977.1798, NA, 977.1459, NA, 977.1791, NA, 977.15,
NA, 977.1832, NA), PresMin_1pnt5m = c(977.0795, 977.1168,
977.0795, NA, 977.0795, NA, 977.0835, NA, 977.0835, NA, 977.1168,
NA), Solar_2m_Avg = c(2.414, 0.849, 2.414, NA, 2.207, NA,
1.623, NA, 1.185, NA, 0.849, NA), Rain_1m_Tot = c(0L, 0L,
0L, NA, 0L, NA, 0L, NA, 0L, NA, 0L, NA), Ts_bare_10cm_Avg = c(28.46,
28.39, 28.46, NA, 28.44, NA, 28.42, NA, 28.41, NA, 28.39,
NA), TsMax_bare_10cm = c(28.47, 28.4, 28.47, NA, 28.45, NA,
28.43, NA, 28.41, NA, 28.4, NA), TsMin_bare_10cm = c(28.45,
28.38, 28.45, NA, 28.43, NA, 28.42, NA, 28.4, NA, 28.38,
NA), BattVolts_Min = c(12.76, 12.75, 12.76, NA, 12.76, NA,
12.76, NA, 12.75, NA, 12.75, NA), LithBatt_Min = c(3.447,
3.447, 3.447, NA, 3.447, NA, 3.447, NA, 3.447, NA, 3.447,
NA), MaintMode = c(0L, 0L, 0L, NA, 0L, NA, 0L, NA, 0L, NA,
0L, NA), Ta_10m_Avg = c(NA, NA, NA, 23.65, NA, 23.7, NA,
23.64, NA, 23.58, NA, 23.45), TaMax_10m = c(NA, NA, NA, 23.8,
NA, 23.99, NA, 23.94, NA, 23.9, NA, 23.59), TaMin_10m = c(NA,
NA, NA, 23.55, NA, 23.57, NA, 23.52, NA, 23.43, NA, 23.34
), RH_10m_Avg = c(NA, NA, NA, 60.32, NA, 60.5, NA, 60.68,
NA, 61.27, NA, 61.65), RHMax_10m = c(NA, NA, NA, 60.46, NA,
60.78, NA, 60.84, NA, 61.64, NA, 61.82), RHMin_10m = c(NA,
NA, NA, 60.18, NA, 60.31, NA, 60.53, NA, 60.76, NA, 61.54
), Dp_10m_Avg = c(NA, NA, NA, 15.52, NA, 15.61, NA, 15.61,
NA, 15.7, NA, 15.67), DpMax_10m = c(NA, NA, NA, 15.7, NA,
15.96, NA, 15.92, NA, 16.02, NA, 15.82), DpMin_10m = c(NA,
NA, NA, 15.39, NA, 15.44, NA, 15.46, NA, 15.47, NA, 15.55
), HeatIndex_10m_Avg = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), HeatIndexMax_10m = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA), WindChill_10m_Avg = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), WindChillMin_10m = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), WndAveSpd_10m = c(NA,
NA, NA, 1.266, NA, 1.005, NA, 1.188, NA, 0.394, NA, 0.508
), WndVecMagAve_10m = c(NA, NA, NA, 1.247, NA, 0.969, NA,
1.163, NA, 0.377, NA, 0.499), WndAveDir_10m = c(NA, NA, NA,
147.9, NA, 152, NA, 139, NA, 126.9, NA, 143.5), WndAveDirSD_10m = c(NA,
NA, NA, 9.94, NA, 15.42, NA, 11.81, NA, 16.94, NA, 11.08),
WndMaxSpd5s_10m = c(NA, NA, NA, 1.799, NA, 1.599, NA, 1.633,
NA, 0.8, NA, 0.8), WndMax_5sec_Dir_10m = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), WndMaxSpd5s_Dir_10m = c(NA,
NA, NA, 143.4, NA, 144.4, NA, 131.3, NA, 95.8, NA, 160.7),
source = c("D:\\Data_Collected_11202023\\Data_Collected/0034/0034_1min.csv (2).backup",
"D:\\Data_Collected_11202023\\Data_Collected/0034/0034_1min.csv (2).backup",
"D:\\Data_Collected_11202023\\Data_Collected/0034/0034_1min.csv (2).backup",
"D:\\Data_Collected_11202023\\Data_Collected/0034/0034_1min_extra.csv.backup",
"D:\\Data_Collected_11202023\\Data_Collected/0034/0034_1min.csv (2).backup",
"D:\\Data_Collected_11202023\\Data_Collected/0034/0034_1min_extra.csv.backup",
"D:\\Data_Collected_11202023\\Data_Collected/0034/0034_1min.csv (2).backup",
"D:\\Data_Collected_11202023\\Data_Collected/0034/0034_1min_extra.csv.backup",
"D:\\Data_Collected_11202023\\Data_Collected/0034/0034_1min.csv (2).backup",
"D:\\Data_Collected_11202023\\Data_Collected/0034/0034_1min_extra.csv.backup",
"D:\\Data_Collected_11202023\\Data_Collected/0034/0034_1min.csv (2).backup",
"D:\\Data_Collected_11202023\\Data_Collected/0034/0034_1min_extra.csv.backup"
)), class = "data.frame", row.names = c(NA, -12L))
Desired output:
desired_output <- structure(list(File_Category = c("OneMinute", "OneMinuteExtra",
"OneMinute", "OneMinute", "OneMinute", "OneMinute", "OneMinute"
), File_Station_Name = c("Lincoln 1500 N 45th", "Lincoln 1500 N 45th",
"Lincoln 1500 N 45th", "Lincoln 1500 N 45th", "Lincoln 1500 N 45th",
"Lincoln 1500 N 45th", "Lincoln 1500 N 45th"), TIMESTAMP = c("2019-09-05 00:57:00",
"2019-09-05 00:57:00", "2019-09-05 00:58:00", "2019-09-05 00:59:00",
"2019-09-05 01:00:00", "2019-09-05 01:01:00", "2019-09-05 01:02:00"
), RECORD = c(423L, 423L, 424L, 425L, 426L, 427L, 428L), Ta_2m_Avg = c(23.05,
22.93, 23.05, 22.99, 22.97, 22.97, 22.93), TaMax_2m = c(23.2,
23.01, 23.2, 23.1, 23.11, 23.11, 23.01), TaMin_2m = c(22.96,
22.83, 22.96, 22.92, 22.92, 22.92, 22.83), RH_2m_Avg = c(64.07,
65, 64.07, 63.84, 64.07, 64.5, 65), RHMax_2m = c(64.41, 65.28,
64.41, 63.91, 64.25, 65.11, 65.28), RHMin_2m = c(63.85, 64.71,
63.85, 63.75, 63.82, 64.11, 64.71), Dp_2m_Avg = c(15.89, 16.01,
15.89, 15.78, 15.82, 15.93, 16.01), DpMax_2m = c(15.99, 16.14,
15.99, 15.91, 15.98, 16.08, 16.14), DpMin_2m = c(15.78, 15.88,
15.78, 15.69, 15.71, 15.78, 15.88), HeatIndex_2m_Avg = c(23.06,
22.96, 23.06, 23, 22.98, 22.99, 22.96), HeatIndexMax_2m = c(23.22,
23.04, 23.22, 23.12, 23.13, 23.13, 23.04), WindChill_2m_Avg = c(24.85,
26.72, 24.85, 24.67, 24.88, 25.9, 26.72), WindChillMin_2m = c(24.6,
25.01, 24.6, 24.42, 24.75, 24.6, 25.01), WndAveSpd_3m = c(0.752,
0.044, 0.752, 0.969, 0.564, 0.419, 0.044), WndVecMagAve_3m = c(0.715,
0.044, 0.715, 0.959, 0.552, 0.418, 0.044), WndAveDir_3m = c(147.7,
120.3, 147.7, 140.2, 128.1, 140.4, 120.3), WndAveDirSD_3m = c(18.08,
0.176, 18.08, 8.27, 11.74, 3.204, 0.176), WndMaxSpd5s_3m = c(1.1,
0.3, 1.1, 1.366, 0.7, 1, 0.3), WndMax_5sec_Dir_3m = c(157.8,
120.2, 157.8, 126.9, 139.5, 142.7, 120.2), PresAvg_1pnt5m = c(977.1187,
977.1306, 977.1187, 977.1209, 977.127, 977.1389, 977.1306), PresMax_1pnt5m = c(977.1798,
977.1832, 977.1798, 977.1459, 977.1791, 977.15, 977.1832), PresMin_1pnt5m = c(977.0795,
977.1168, 977.0795, 977.0795, 977.0835, 977.0835, 977.1168),
Solar_2m_Avg = c(2.414, 0.849, 2.414, 2.207, 1.623, 1.185,
0.849), Rain_1m_Tot = c(0L, 0L, 0L, 0L, 0L, 0L, 0L), Ts_bare_10cm_Avg = c(28.46,
28.39, 28.46, 28.44, 28.42, 28.41, 28.39), TsMax_bare_10cm = c(28.47,
28.4, 28.47, 28.45, 28.43, 28.41, 28.4), TsMin_bare_10cm = c(28.45,
28.38, 28.45, 28.43, 28.42, 28.4, 28.38), BattVolts_Min = c(12.76,
12.75, 12.76, 12.76, 12.76, 12.75, 12.75), LithBatt_Min = c(3.447,
3.447, 3.447, 3.447, 3.447, 3.447, 3.447), MaintMode = c(0L,
0L, 0L, 0L, 0L, 0L, 0L), Ta_10m_Avg = c(NA, NA, 23.65, 23.7,
23.64, 23.58, 23.45), TaMax_10m = c(NA, NA, 23.8, 23.99,
23.94, 23.9, 23.59), TaMin_10m = c(NA, NA, 23.55, 23.57,
23.52, 23.43, 23.34), RH_10m_Avg = c(NA, NA, 60.32, 60.5,
60.68, 61.27, 61.65), RHMax_10m = c(NA, NA, 60.46, 60.78,
60.84, 61.64, 61.82), RHMin_10m = c(NA, NA, 60.18, 60.31,
60.53, 60.76, 61.54), Dp_10m_Avg = c(NA, NA, 15.52, 15.61,
15.61, 15.7, 15.67), DpMax_10m = c(NA, NA, 15.7, 15.96, 15.92,
16.02, 15.82), DpMin_10m = c(NA, NA, 15.39, 15.44, 15.46,
15.47, 15.55), HeatIndex_10m_Avg = c(NA, NA, NA, NA, NA,
NA, NA), HeatIndexMax_10m = c(NA, NA, NA, NA, NA, NA, NA),
WindChill_10m_Avg = c(NA, NA, NA, NA, NA, NA, NA), WindChillMin_10m = c(NA,
NA, NA, NA, NA, NA, NA), WndAveSpd_10m = c(NA, NA, 1.266,
1.005, 1.188, 0.394, 0.508), WndVecMagAve_10m = c(NA, NA,
1.247, 0.969, 1.163, 0.377, 0.499), WndAveDir_10m = c(NA,
NA, 147.9, 152, 139, 126.9, 143.5), WndAveDirSD_10m = c(NA,
NA, 9.94, 15.42, 11.81, 16.94, 11.08), WndMaxSpd5s_10m = c(NA,
NA, 1.799, 1.599, 1.633, 0.8, 0.8), WndMax_5sec_Dir_10m = c(NA,
NA, NA, NA, NA, NA, NA), WndMaxSpd5s_Dir_10m = c(NA, NA,
143.4, 144.4, 131.3, 95.8, 160.7), source = c("D:\\Data_Collected_11202023\\Data_Collected/0034/0034_1min.csv (2).backup",
"D:\\Data_Collected_11202023\\Data_Collected/0034/0034_1min.csv (2).backup",
"D:\\Data_Collected_11202023\\Data_Collected/0034/0034_1min.csv (2).backup",
"D:\\Data_Collected_11202023\\Data_Collected/0034/0034_1min.csv (2).backup",
"D:\\Data_Collected_11202023\\Data_Collected/0034/0034_1min.csv (2).backup",
"D:\\Data_Collected_11202023\\Data_Collected/0034/0034_1min.csv (2).backup",
"D:\\Data_Collected_11202023\\Data_Collected/0034/0034_1min.csv (2).backup"
)), class = "data.frame", row.names = c(NA, -7L))
I get errors using just about everything. Here's an example.
result <- input_data %>%
group_by(TIMESTAMP) %>%
summarize(across(File_Category:File_Station_Name, first),
across(RECORD, first),
across(Ta_2m_Avg:WndMaxSpd5s_Dir_10m, na.omit),
across(source, first))
Error in names(dots)[[i]] : subscript out of bounds
I know I've asked similar questions before. Answers that have helped me in the past don't seem to work for this. I don't know why. I've read in the data using read_csv so that TIMESTAMP is posixct class. Thank you.
Here's a solution using your example input data. It's not particularly tidy, but maybe you can build on this.
input_data$TIMESTAMP <- as.POSIXct(input_data$TIMESTAMP,
format = "%Y-%m-%d %H:%M:%S",
tz = "UTC")
desired_output$TIMESTAMP <- as.POSIXct(desired_output$TIMESTAMP,
format = "%Y-%m-%d %H:%M:%S",
tz = "UTC")
library(dplyr)
library(tidyr)
result <- input_data %>%
group_by(TIMESTAMP) %>%
# replace NAs with unique values per group (creates duplicates)
fill(Ta_2m_Avg:WndMaxSpd5s_Dir_10m, .direction = "downup") %>%
# remove rows based on duplicates in column range
# select columns range by index or by name
# .[which(!duplicated(.[,c(which(colnames(.) == "Ta_2m_Avg"):which(colnames(.) == "WndMaxSpd5s_Dir_10m"))])),]
.[which(!duplicated(.[,c(5:54)])),]
# confirm
library(diffdf)
diffdf(desired_output, result)
# > No issues were found!