joindplyrnaoverlapping-matches

Overlapping join with dplyr when reference tibble contains NA / missing values


I'd like to join two dataframes: reference data, refdata, containing data describing when a person was wearing a collar; and movedata, containing data describing where a collar was at a given time. Collars can be moved from person to person, and if a collar is still on, there is no timeend defined for that collar.

Here's some example data and one of the joins I've tried:

library(dplyr)

refdata <- tibble(person = c("Jess", "Andy", "Cody"),
                  collar = c("a", "b", "a"),
                  timestart = c(1, 2, 5),
                  timeend = c(3, 4, NA))

movedata <- tibble(collar = c("a", "a", "a", "a", "a", "a", "a", 
                              "b", "b", "b", "b", "b", "b", "b", "c", "c", "c"), 
                   time = c(1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3), 
          location = c("skatepark", "home", "library", "street", "crib", "crib", "bassinet", "street", "work", "work", "home", "street", "street", "street", "street", "street", "street"))

by <- join_by(collar,
              between(time, timestart, timeend))
check <- full_join(movedata, refdata, by)

# View input data
refdata
# A tibble: 3 × 4
  person collar timestart timeend
  <chr>  <chr>      <dbl>   <dbl>
1 Jess   a              1       3
2 Andy   b              2       4
3 Cody   a              5      NA

movedata
# A tibble: 17 × 3
   collar  time location 
   <chr>  <dbl> <chr>    
 1 a          1 skatepark
 2 a          2 home     
 3 a          3 library  
 4 a          4 street   
 5 a          5 crib     
 6 a          6 crib     
 7 a          7 bassinet 
 8 b          1 street   
 9 b          2 work     
10 b          3 work     
11 b          4 home     
12 b          5 street   
13 b          6 street   
14 b          7 street   
15 c          1 street   
16 c          2 street   
17 c          3 street

# result of join (not desired output):
check 
# A tibble: 18 × 6
   collar  time location  person timestart timeend
   <chr>  <dbl> <chr>     <chr>      <dbl>   <dbl>
 1 a          1 skatepark Jess           1       3
 2 a          2 home      Jess           1       3
 3 a          3 library   Jess           1       3
 4 a          4 street    NA            NA      NA
 5 a          5 crib      NA            NA      NA
 6 a          6 crib      NA            NA      NA
 7 a          7 bassinet  NA            NA      NA
 8 b          1 street    NA            NA      NA
 9 b          2 work      Andy           2       4
10 b          3 work      Andy           2       4
11 b          4 home      Andy           2       4
12 b          5 street    NA            NA      NA
13 b          6 street    NA            NA      NA
14 b          7 street    NA            NA      NA
15 c          1 street    NA            NA      NA
16 c          2 street    NA            NA      NA
17 c          3 street    NA            NA      NA
18 a         NA NA        Cody           5      NA

but my desired result would show that Cody has worn, and is still wearing, the collar:

# Expected result
# A tibble: 17 × 6
   collar  time location  person timestart timeend
   <chr>  <dbl> <chr>     <chr>      <dbl>   <dbl>
 1 a          1 skatepark Jess           1       3
 2 a          2 home      Jess           1       3
 3 a          3 library   Jess           1       3
 4 a          4 street    NA             5      NA
 5 a          5 crib      Cody           5      NA
 6 a          6 crib      Cody           5      NA
 7 a          7 bassinet  Cody           5      NA
 8 b          1 street    NA            NA      NA
 9 b          2 work      Andy           2       4
10 b          3 work      Andy           2       4
11 b          4 home      Andy           2       4
12 b          5 street    NA            NA      NA
13 b          6 street    NA            NA      NA
14 b          7 street    NA            NA      NA
15 c          1 street    NA            NA      NA
16 c          2 street    NA            NA      NA
17 c          3 street    NA            NA      NA

I'd be interested in two solutions: one that creates the expected table above, and one that removes any rows that are not associated with a person (so rows 8 & 12-17, above).


Solution

  • One way would be to replace the NAs in timeend with Inf:

    refdata$timeend <- replace(refdata$timeend, is.na(refdata$timeend), Inf)
    by <- join_by(collar,
                  between(time, timestart, timeend))
    full_join(movedata, refdata, by)
    
    # # A tibble: 17 × 6
    #    collar  time location  person timestart timeend
    #    <chr>  <dbl> <chr>     <chr>      <dbl>   <dbl>
    #  1 a          1 skatepark Jess           1       3
    #  2 a          2 home      Jess           1       3
    #  3 a          3 library   Jess           1       3
    #  4 a          4 street    NA            NA      NA
    #  5 a          5 crib      Cody           5     Inf
    #  6 a          6 crib      Cody           5     Inf
    #  7 a          7 bassinet  Cody           5     Inf
    #  8 b          1 street    NA            NA      NA
    #  9 b          2 work      Andy           2       4
    # 10 b          3 work      Andy           2       4
    # 11 b          4 home      Andy           2       4
    # 12 b          5 street    NA            NA      NA
    # 13 b          6 street    NA            NA      NA
    # 14 b          7 street    NA            NA      NA
    # 15 c          1 street    NA            NA      NA
    # 16 c          2 street    NA            NA      NA
    # 17 c          3 street    NA            NA      NA
    

    For your second expected output, check inner_join:

    inner_join(movedata, refdata, by)
    
    # # A tibble: 9 × 6
    #   collar  time location  person timestart timeend
    #   <chr>  <dbl> <chr>     <chr>      <dbl>   <dbl>
    # 1 a          1 skatepark Jess           1       3
    # 2 a          2 home      Jess           1       3
    # 3 a          3 library   Jess           1       3
    # 4 a          5 crib      Cody           5     Inf
    # 5 a          6 crib      Cody           5     Inf
    # 6 a          7 bassinet  Cody           5     Inf
    # 7 b          2 work      Andy           2       4
    # 8 b          3 work      Andy           2       4
    # 9 b          4 home      Andy           2       4