I have two dataframes. DF1 has time intervals including milliseconds, with a corresponding value (behavior). DF2 has a column of times (including ms).
DF1:
START STOP behavior
1 2023-07-20 14:07:39.00 2023-07-20 12:07:39.14 dig
2 2023-07-20 14:07:39.27 2023-07-20 12:07:39.90 pig
DF2:
x time label
133202 20/07/2023 12:07:39.020 2023-07-20 12:07:39 NA
133203 20/07/2023 12:07:39.040 2023-07-20 12:07:39 NA
133204 20/07/2023 12:07:39.060 2023-07-20 12:07:39 NA
133205 20/07/2023 12:07:39.080 2023-07-20 12:07:39 NA
133206 20/07/2023 12:07:39.100 2023-07-20 12:07:39 NA
133207 20/07/2023 12:07:39.120 2023-07-20 12:07:39 NA
133208 20/07/2023 12:07:39.140 2023-07-20 12:07:39 NA
133209 20/07/2023 12:07:39.160 2023-07-20 12:07:39 NA
133210 20/07/2023 12:07:39.180 2023-07-20 12:07:39 NA
133211 20/07/2023 12:07:39.200 2023-07-20 12:07:39 NA
133212 20/07/2023 12:07:39.220 2023-07-20 12:07:39 NA
133213 20/07/2023 12:07:39.240 2023-07-20 12:07:39 NA
133214 20/07/2023 12:07:39.260 2023-07-20 12:07:39 NA
133215 20/07/2023 12:07:39.280 2023-07-20 12:07:39 NA
133216 20/07/2023 12:07:39.300 2023-07-20 12:07:39 NA
133217 20/07/2023 12:07:39.320 2023-07-20 12:07:39 NA
133218 20/07/2023 12:07:39.340 2023-07-20 12:07:39 NA
133219 20/07/2023 12:07:39.360 2023-07-20 12:07:39 NA
133220 20/07/2023 12:07:39.380 2023-07-20 12:07:39 NA
133221 20/07/2023 12:07:39.400 2023-07-20 12:07:39 NA
I want to produce a dataframe that is DF2, but with a column that corresponds to behavior in DF1 if the time occurs within time intervals (including milliseconds) in DF1.
Desired output:
x time label
133202 20/07/2023 12:07:39.020 2023-07-20 12:07:39 dig
133203 20/07/2023 12:07:39.040 2023-07-20 12:07:39 dig
133204 20/07/2023 12:07:39.060 2023-07-20 12:07:39 dig
133205 20/07/2023 12:07:39.080 2023-07-20 12:07:39 dig
133206 20/07/2023 12:07:39.100 2023-07-20 12:07:39 dig
133207 20/07/2023 12:07:39.120 2023-07-20 12:07:39 dig
133208 20/07/2023 12:07:39.140 2023-07-20 12:07:39 dig
133209 20/07/2023 12:07:39.160 2023-07-20 12:07:39 <NA>
133210 20/07/2023 12:07:39.180 2023-07-20 12:07:39 <NA>
133211 20/07/2023 12:07:39.200 2023-07-20 12:07:39 <NA>
133212 20/07/2023 12:07:39.220 2023-07-20 12:07:39 <NA>
133213 20/07/2023 12:07:39.240 2023-07-20 12:07:39 <NA>
133214 20/07/2023 12:07:39.260 2023-07-20 12:07:39 <NA>
133215 20/07/2023 12:07:39.280 2023-07-20 12:07:39 pig
133216 20/07/2023 12:07:39.300 2023-07-20 12:07:39 pig
133217 20/07/2023 12:07:39.320 2023-07-20 12:07:39 pig
133218 20/07/2023 12:07:39.340 2023-07-20 12:07:39 pig
133219 20/07/2023 12:07:39.360 2023-07-20 12:07:39 pig
133220 20/07/2023 12:07:39.380 2023-07-20 12:07:39 pig
133221 20/07/2023 12:07:39.400 2023-07-20 12:07:39 pig
Here are the data inputs:
DF1
#new dataframe for testing#
START<-c("2023-07-20 14:07:39.01", "2023-07-20 14:07:39.29")
START<- as.POSIXct(START, "%Y-%m-%d %H:%M:%OS",tz = "UTC")
START<-format(START, "%Y-%m-%d %H:%M:%OS3")
STOP<-c("2023-07-20 14:07:39.14","2023-07-20 14:07:39.90")
STOP<- as.POSIXct(STOP, "%Y-%m-%d %H:%M:%OS",tz = "UTC")
STOP<-format(STOP, "%Y-%m-%d %H:%M:%OS2")
behavior<-c("dig", "pig")
DF1<-data.frame(START, STOP, behavior)
DF1$START<- as.POSIXct(DF1$START, "%Y-%m-%d %H:%M:%OS",tz = "UTC")
DF1$STOP<-as.POSIXct(DF1$STOP, "%Y-%m-%d %H:%M:%OS",tz = "UTC")
DF1$START<-format(DF1$START, "%Y-%m-%d %H:%M:%OS2")
DF1$STOP<-format(DF1$STOP, "%Y-%m-%d %H:%M:%OS2")
DF2 (x is time shown with ms, using format(df2$x, "%Y-%m-%d %H:%M:%OS2")
structure(list(x = c("20/07/2023 12:07:39.020", "20/07/2023 12:07:39.040",
"20/07/2023 12:07:39.060", "20/07/2023 12:07:39.080", "20/07/2023 12:07:39.100",
"20/07/2023 12:07:39.120", "20/07/2023 12:07:39.140", "20/07/2023 12:07:39.160",
"20/07/2023 12:07:39.180", "20/07/2023 12:07:39.200", "20/07/2023 12:07:39.220",
"20/07/2023 12:07:39.240", "20/07/2023 12:07:39.260", "20/07/2023 12:07:39.280",
"20/07/2023 12:07:39.300", "20/07/2023 12:07:39.320", "20/07/2023 12:07:39.340",
"20/07/2023 12:07:39.360", "20/07/2023 12:07:39.380", "20/07/2023 12:07:39.400"
), time = structure(c(1689854859.02, 1689854859.04, 1689854859.06,
1689854859.08, 1689854859.1, 1689854859.12, 1689854859.14, 1689854859.16,
1689854859.18, 1689854859.2, 1689854859.22, 1689854859.24, 1689854859.26,
1689854859.28, 1689854859.3, 1689854859.32, 1689854859.34, 1689854859.36,
1689854859.38, 1689854859.4), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
label = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA)), row.names = 133202:133221, class = "data.frame")
I want to produce this:
dput(df2)
structure(list(x = c("20/07/2023 12:07:39.020", "20/07/2023 12:07:39.040",
"20/07/2023 12:07:39.060", "20/07/2023 12:07:39.080", "20/07/2023 12:07:39.100",
"20/07/2023 12:07:39.120", "20/07/2023 12:07:39.140", "20/07/2023 12:07:39.160",
"20/07/2023 12:07:39.180", "20/07/2023 12:07:39.200", "20/07/2023 12:07:39.220",
"20/07/2023 12:07:39.240", "20/07/2023 12:07:39.260", "20/07/2023 12:07:39.280",
"20/07/2023 12:07:39.300", "20/07/2023 12:07:39.320", "20/07/2023 12:07:39.340",
"20/07/2023 12:07:39.360", "20/07/2023 12:07:39.380", "20/07/2023 12:07:39.400"
), time = structure(c(1689854859.02, 1689854859.04, 1689854859.06,
1689854859.08, 1689854859.1, 1689854859.12, 1689854859.14, 1689854859.16,
1689854859.18, 1689854859.2, 1689854859.22, 1689854859.24, 1689854859.26,
1689854859.28, 1689854859.3, 1689854859.32, 1689854859.34, 1689854859.36,
1689854859.38, 1689854859.4), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
label = c("dig", "dig", "dig", "dig", "dig", "dig", "dig",
NA, NA, NA, NA, NA, NA, "pig", "pig", "pig", "pig", "pig",
"pig", "pig")), row.names = 133202:133221, class = "data.frame")
I can produce the desired result using the following code:
for (i in 1:nrow(df2)) {
time_val <- df2$time[i]
# Find the row where time_val falls within the interval
interval_row <- which(time_val >= df1$START & time_val <= df1$STOP)
# If there's a match, replace NA in label with the corresponding behavior
if (length(interval_row) > 0) {
behavior_val <- df1$behavior[interval_row]
df2$label[i] <- behavior_val
}
}
with the dummy data for DF1.
When I try to run using my full DF1 it doesn't seem to be finding the time intervals (throwing the error "Error in df2$label[i] <- behavior_val : replacement has length zero", even though the str() output is identical. I can't work out why. I have also noticed that this seems to only work when I use the columns of DF1 when they are format(DF1$STOP, "%Y-%m-%d %H:%M:%OS2")
Not when they are formatted as POSIXct, which makes me think this is a very made way to do what I think is a simple problem!
I also think this is a messy solution and the for loop takes a while to run. Is there a better way to get my desired results and still accounting for the milliseconds in the time intervals and data?
I started to look into data.table solutions, but it doesn't sound like this is great to use for time intervals with milliseconds?
Please help!
Thank you so much in advance!
If I understand you correctly, you would like to merge behavior to DF2
depending on whether x
falls into the interval defined by START
and STOP
?
Below, I assume that the time in DF1
is not 14h but 12h:
library(tidyverse)
DF1 <- as_tibble(DF1) |>
mutate(across(c(START, STOP), ymd_hms))
DF2 <- as_tibble(DF2) |>
mutate(x = dmy_hms(x))
DF2 |>
left_join(DF1, join_by(between(x, START, STOP))) |>
select(x, time, behavior)
#> # A tibble: 20 × 3
#> x time behavior
#> <dttm> <dttm> <chr>
#> 1 2023-07-20 12:07:39 2023-07-20 12:07:39 dig
#> 2 2023-07-20 12:07:39 2023-07-20 12:07:39 dig
#> 3 2023-07-20 12:07:39 2023-07-20 12:07:39 dig
#> 4 2023-07-20 12:07:39 2023-07-20 12:07:39 dig
#> 5 2023-07-20 12:07:39 2023-07-20 12:07:39 dig
#> 6 2023-07-20 12:07:39 2023-07-20 12:07:39 dig
#> 7 2023-07-20 12:07:39 2023-07-20 12:07:39 dig
#> 8 2023-07-20 12:07:39 2023-07-20 12:07:39 <NA>
#> 9 2023-07-20 12:07:39 2023-07-20 12:07:39 <NA>
#> 10 2023-07-20 12:07:39 2023-07-20 12:07:39 <NA>
#> 11 2023-07-20 12:07:39 2023-07-20 12:07:39 <NA>
#> 12 2023-07-20 12:07:39 2023-07-20 12:07:39 <NA>
#> 13 2023-07-20 12:07:39 2023-07-20 12:07:39 <NA>
#> 14 2023-07-20 12:07:39 2023-07-20 12:07:39 pig
#> 15 2023-07-20 12:07:39 2023-07-20 12:07:39 pig
#> 16 2023-07-20 12:07:39 2023-07-20 12:07:39 pig
#> 17 2023-07-20 12:07:39 2023-07-20 12:07:39 pig
#> 18 2023-07-20 12:07:39 2023-07-20 12:07:39 pig
#> 19 2023-07-20 12:07:39 2023-07-20 12:07:39 pig
#> 20 2023-07-20 12:07:39 2023-07-20 12:07:39 pig
Created on 2023-12-08 with reprex v2.0.2