rtimemergedata.tableposixct

How to label rows with a specific value (including MS) if time occurs in given time interval in R


I have two dataframes. DF1 has time intervals including milliseconds, with a corresponding value (behavior). DF2 has a column of times (including ms).

DF1:

                   START                   STOP behavior
1 2023-07-20 14:07:39.00 2023-07-20 12:07:39.14      dig
2 2023-07-20 14:07:39.27 2023-07-20 12:07:39.90      pig

DF2:

                             x                time label
133202 20/07/2023 12:07:39.020 2023-07-20 12:07:39    NA
133203 20/07/2023 12:07:39.040 2023-07-20 12:07:39    NA
133204 20/07/2023 12:07:39.060 2023-07-20 12:07:39    NA
133205 20/07/2023 12:07:39.080 2023-07-20 12:07:39    NA
133206 20/07/2023 12:07:39.100 2023-07-20 12:07:39    NA
133207 20/07/2023 12:07:39.120 2023-07-20 12:07:39    NA
133208 20/07/2023 12:07:39.140 2023-07-20 12:07:39    NA
133209 20/07/2023 12:07:39.160 2023-07-20 12:07:39    NA
133210 20/07/2023 12:07:39.180 2023-07-20 12:07:39    NA
133211 20/07/2023 12:07:39.200 2023-07-20 12:07:39    NA
133212 20/07/2023 12:07:39.220 2023-07-20 12:07:39    NA
133213 20/07/2023 12:07:39.240 2023-07-20 12:07:39    NA
133214 20/07/2023 12:07:39.260 2023-07-20 12:07:39    NA
133215 20/07/2023 12:07:39.280 2023-07-20 12:07:39    NA
133216 20/07/2023 12:07:39.300 2023-07-20 12:07:39    NA
133217 20/07/2023 12:07:39.320 2023-07-20 12:07:39    NA
133218 20/07/2023 12:07:39.340 2023-07-20 12:07:39    NA
133219 20/07/2023 12:07:39.360 2023-07-20 12:07:39    NA
133220 20/07/2023 12:07:39.380 2023-07-20 12:07:39    NA
133221 20/07/2023 12:07:39.400 2023-07-20 12:07:39    NA

I want to produce a dataframe that is DF2, but with a column that corresponds to behavior in DF1 if the time occurs within time intervals (including milliseconds) in DF1.

Desired output:

                            x                time label
133202 20/07/2023 12:07:39.020 2023-07-20 12:07:39   dig
133203 20/07/2023 12:07:39.040 2023-07-20 12:07:39   dig
133204 20/07/2023 12:07:39.060 2023-07-20 12:07:39   dig
133205 20/07/2023 12:07:39.080 2023-07-20 12:07:39   dig
133206 20/07/2023 12:07:39.100 2023-07-20 12:07:39   dig
133207 20/07/2023 12:07:39.120 2023-07-20 12:07:39   dig
133208 20/07/2023 12:07:39.140 2023-07-20 12:07:39   dig
133209 20/07/2023 12:07:39.160 2023-07-20 12:07:39  <NA>
133210 20/07/2023 12:07:39.180 2023-07-20 12:07:39  <NA>
133211 20/07/2023 12:07:39.200 2023-07-20 12:07:39  <NA>
133212 20/07/2023 12:07:39.220 2023-07-20 12:07:39  <NA>
133213 20/07/2023 12:07:39.240 2023-07-20 12:07:39  <NA>
133214 20/07/2023 12:07:39.260 2023-07-20 12:07:39  <NA>
133215 20/07/2023 12:07:39.280 2023-07-20 12:07:39   pig
133216 20/07/2023 12:07:39.300 2023-07-20 12:07:39   pig
133217 20/07/2023 12:07:39.320 2023-07-20 12:07:39   pig
133218 20/07/2023 12:07:39.340 2023-07-20 12:07:39   pig
133219 20/07/2023 12:07:39.360 2023-07-20 12:07:39   pig
133220 20/07/2023 12:07:39.380 2023-07-20 12:07:39   pig
133221 20/07/2023 12:07:39.400 2023-07-20 12:07:39   pig

Here are the data inputs:

DF1

#new dataframe for testing#

START<-c("2023-07-20 14:07:39.01", "2023-07-20 14:07:39.29")

START<- as.POSIXct(START, "%Y-%m-%d %H:%M:%OS",tz = "UTC")

START<-format(START, "%Y-%m-%d %H:%M:%OS3")

STOP<-c("2023-07-20 14:07:39.14","2023-07-20 14:07:39.90")

STOP<- as.POSIXct(STOP, "%Y-%m-%d %H:%M:%OS",tz = "UTC")

STOP<-format(STOP, "%Y-%m-%d %H:%M:%OS2")

behavior<-c("dig", "pig")

DF1<-data.frame(START, STOP, behavior)

DF1$START<- as.POSIXct(DF1$START, "%Y-%m-%d %H:%M:%OS",tz = "UTC")

DF1$STOP<-as.POSIXct(DF1$STOP, "%Y-%m-%d %H:%M:%OS",tz = "UTC")

DF1$START<-format(DF1$START, "%Y-%m-%d %H:%M:%OS2")

DF1$STOP<-format(DF1$STOP, "%Y-%m-%d %H:%M:%OS2")

DF2 (x is time shown with ms, using format(df2$x, "%Y-%m-%d %H:%M:%OS2")

structure(list(x = c("20/07/2023 12:07:39.020", "20/07/2023 12:07:39.040", 
"20/07/2023 12:07:39.060", "20/07/2023 12:07:39.080", "20/07/2023 12:07:39.100", 
"20/07/2023 12:07:39.120", "20/07/2023 12:07:39.140", "20/07/2023 12:07:39.160", 
"20/07/2023 12:07:39.180", "20/07/2023 12:07:39.200", "20/07/2023 12:07:39.220", 
"20/07/2023 12:07:39.240", "20/07/2023 12:07:39.260", "20/07/2023 12:07:39.280", 
"20/07/2023 12:07:39.300", "20/07/2023 12:07:39.320", "20/07/2023 12:07:39.340", 
"20/07/2023 12:07:39.360", "20/07/2023 12:07:39.380", "20/07/2023 12:07:39.400"
), time = structure(c(1689854859.02, 1689854859.04, 1689854859.06, 
1689854859.08, 1689854859.1, 1689854859.12, 1689854859.14, 1689854859.16, 
1689854859.18, 1689854859.2, 1689854859.22, 1689854859.24, 1689854859.26, 
1689854859.28, 1689854859.3, 1689854859.32, 1689854859.34, 1689854859.36, 
1689854859.38, 1689854859.4), class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
    label = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA)), row.names = 133202:133221, class = "data.frame")

I want to produce this:

 dput(df2)
structure(list(x = c("20/07/2023 12:07:39.020", "20/07/2023 12:07:39.040", 
"20/07/2023 12:07:39.060", "20/07/2023 12:07:39.080", "20/07/2023 12:07:39.100", 
"20/07/2023 12:07:39.120", "20/07/2023 12:07:39.140", "20/07/2023 12:07:39.160", 
"20/07/2023 12:07:39.180", "20/07/2023 12:07:39.200", "20/07/2023 12:07:39.220", 
"20/07/2023 12:07:39.240", "20/07/2023 12:07:39.260", "20/07/2023 12:07:39.280", 
"20/07/2023 12:07:39.300", "20/07/2023 12:07:39.320", "20/07/2023 12:07:39.340", 
"20/07/2023 12:07:39.360", "20/07/2023 12:07:39.380", "20/07/2023 12:07:39.400"
), time = structure(c(1689854859.02, 1689854859.04, 1689854859.06, 
1689854859.08, 1689854859.1, 1689854859.12, 1689854859.14, 1689854859.16, 
1689854859.18, 1689854859.2, 1689854859.22, 1689854859.24, 1689854859.26, 
1689854859.28, 1689854859.3, 1689854859.32, 1689854859.34, 1689854859.36, 
1689854859.38, 1689854859.4), class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
    label = c("dig", "dig", "dig", "dig", "dig", "dig", "dig", 
    NA, NA, NA, NA, NA, NA, "pig", "pig", "pig", "pig", "pig", 
    "pig", "pig")), row.names = 133202:133221, class = "data.frame")

I can produce the desired result using the following code:

for (i in 1:nrow(df2)) {
  time_val <- df2$time[i]
  # Find the row where time_val falls within the interval
  interval_row <- which(time_val >= df1$START & time_val <= df1$STOP)
  
  # If there's a match, replace NA in label with the corresponding behavior
  if (length(interval_row) > 0) {
    behavior_val <- df1$behavior[interval_row]
    df2$label[i] <- behavior_val
  }
}

with the dummy data for DF1.

When I try to run using my full DF1 it doesn't seem to be finding the time intervals (throwing the error "Error in df2$label[i] <- behavior_val : replacement has length zero", even though the str() output is identical. I can't work out why. I have also noticed that this seems to only work when I use the columns of DF1 when they are format(DF1$STOP, "%Y-%m-%d %H:%M:%OS2") Not when they are formatted as POSIXct, which makes me think this is a very made way to do what I think is a simple problem!

I also think this is a messy solution and the for loop takes a while to run. Is there a better way to get my desired results and still accounting for the milliseconds in the time intervals and data?

I started to look into data.table solutions, but it doesn't sound like this is great to use for time intervals with milliseconds?

Please help!

Thank you so much in advance!


Solution

  • If I understand you correctly, you would like to merge behavior to DF2 depending on whether x falls into the interval defined by START and STOP?

    Below, I assume that the time in DF1 is not 14h but 12h:

    library(tidyverse)
    
    DF1 <- as_tibble(DF1) |> 
      mutate(across(c(START, STOP), ymd_hms))
    
    DF2 <- as_tibble(DF2) |> 
      mutate(x = dmy_hms(x))
    
    DF2 |> 
      left_join(DF1, join_by(between(x, START, STOP))) |> 
      select(x, time, behavior)
    #> # A tibble: 20 × 3
    #>    x                   time                behavior
    #>    <dttm>              <dttm>              <chr>   
    #>  1 2023-07-20 12:07:39 2023-07-20 12:07:39 dig     
    #>  2 2023-07-20 12:07:39 2023-07-20 12:07:39 dig     
    #>  3 2023-07-20 12:07:39 2023-07-20 12:07:39 dig     
    #>  4 2023-07-20 12:07:39 2023-07-20 12:07:39 dig     
    #>  5 2023-07-20 12:07:39 2023-07-20 12:07:39 dig     
    #>  6 2023-07-20 12:07:39 2023-07-20 12:07:39 dig     
    #>  7 2023-07-20 12:07:39 2023-07-20 12:07:39 dig     
    #>  8 2023-07-20 12:07:39 2023-07-20 12:07:39 <NA>    
    #>  9 2023-07-20 12:07:39 2023-07-20 12:07:39 <NA>    
    #> 10 2023-07-20 12:07:39 2023-07-20 12:07:39 <NA>    
    #> 11 2023-07-20 12:07:39 2023-07-20 12:07:39 <NA>    
    #> 12 2023-07-20 12:07:39 2023-07-20 12:07:39 <NA>    
    #> 13 2023-07-20 12:07:39 2023-07-20 12:07:39 <NA>    
    #> 14 2023-07-20 12:07:39 2023-07-20 12:07:39 pig     
    #> 15 2023-07-20 12:07:39 2023-07-20 12:07:39 pig     
    #> 16 2023-07-20 12:07:39 2023-07-20 12:07:39 pig     
    #> 17 2023-07-20 12:07:39 2023-07-20 12:07:39 pig     
    #> 18 2023-07-20 12:07:39 2023-07-20 12:07:39 pig     
    #> 19 2023-07-20 12:07:39 2023-07-20 12:07:39 pig     
    #> 20 2023-07-20 12:07:39 2023-07-20 12:07:39 pig
    

    Created on 2023-12-08 with reprex v2.0.2