rtidyverse

Identifying observations that overlap in space and time


I have a data frame were each row is a unique observation.

Observations overlap in time if they are located within a specified temporal distance (e.g. 30 days) of one another.

Observations overlap in space if they are located within a specified spatial distance (e.g. 20 kilometers) of one another.

I am working with the collections of observations that overlap in both time and space. I want to make a column (overlaps) that contains vectors with the ids of the observations that overlap with an observation. I have tried the solution below, but the run time is too poor for the solution to be applicable.

library(dplyr)
library(lubridate)
library(purrr)
library(geosphere)

spat_proximity <- function(x, y, z) {
  
  return(which(map_dbl(y, ~ distGeo(., x)) <= z))}


temp_proximity <- function(x, y, z) {
    
  return(which(map_dbl(y, ~ abs(x - .)) <= z))}


test %>%
  mutate(overlaps = map2(map(place, ~ spat_proximity(., place, 20000)),
                         map(time, ~ temp_proximity(., time, 30)),
                         ~ intersect(.x, .y)))

How can I speed things up?

Desired output


structure(list(id = 1:42, time = structure(c(1478601762, 1475170279, 
1469770219, 1462441336, 1474739469, 1488216507, 1475203721, 1468705558, 
1481722718, 1485897197, 1488669576, 1501288618, 1510266595, 1516828588, 
1497048175, 1516546144, 1507576242, 1517654363, 1496070298, 1519765220, 
1507408104, 1532046710, 1542196446, 1534747170, 1533605231, 1521381844, 
1545389880, 1537988628, 1544304998, 1524842149, 1551051077, 1540822870, 
1579775599, 1580337175, 1551486497, 1554879837, 1568620434, 1568701543, 
1556387550, 1561253396, 1582925482, 1562166384), class = c("POSIXct", 
"POSIXt"), tzone = "UTC"), place = list(c(7.59729413351368, 52.6052275122351
), c(9.99728447956781, 53.43773657253), c(10.1114473929533, 53.1295890148866
), c(7.74115218835801, 53.555354690339), c(9.82895066827581, 
53.1009319396015), c(10.061107415855, 53.1908752763309), c(10.1134381934544, 
53.1450558612239), c(8.59001735546083, 53.1767797285482), c(6.43939168487555, 
52.5520931654252), c(8.38811111096636, 53.9043055557574), c(6.20061916537948, 
52.462037409576), c(8.66656282486832, 52.8269702466929), c(9.92127490588442, 
53.1240045666796), c(9.77810957468704, 53.1445777603789), c(10.0972382106036, 
53.1604265989175), c(10.0473952445094, 53.1698097395641), c(9.23773401919961, 
53.2120381900218), c(8.29524237837988, 52.822332696399), c(6.63690696797941, 
53.4436726627048), c(6.89839325296288, 53.947454203445), c(6.97064542834721, 
54.2487197094445), c(9.98865072631714, 53.4088944299342), c(9.94164401569524, 
53.1500576073959), c(9.64242996587752, 52.9285470044703), c(10.1026940185685, 
53.1635394335485), c(9.94874529044194, 53.2202512735354), c(8.8025526552284, 
53.2423093779114), c(7.93352467761445, 52.9129105531343), c(6.6418846001424, 
53.2459031608081), c(7.56102465003101, 53.5306444680171), c(7.36619114998468, 
53.748869508885), c(7.40993284414052, 54.5367797663042), c(9.90022663895919, 
53.3726361099083), c(9.41110555596208, 52.5001044709056), c(10.1151193231519, 
53.1539029361817), c(10.1064400828529, 53.1793449776572), c(9.94235711256256, 
53.2622041055899), c(9.44215997717822, 53.4799339987572), c(7.03832846889284, 
53.1986115213435), c(7.32755360272354, 53.416700338513), c(7.57828611098173, 
53.6107027769073), c(7.55411005022882, 54.1905803935834)), overlaps = list(
    1L, 2L, 3L, 4L, c(5L, 7L), 6L, c(5L, 7L), 8L, 9L, 10L, 11L, 
    12L, 13L, c(14L, 16L), 15L, c(14L, 16L), 17L, 18L, 19L, 20L, 
    21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 
    33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L)), row.names = c(NA, 
-42L), class = c("tbl_df", "tbl", "data.frame"))

Data

structure(list(id = 1:42, time = structure(c(1478601762, 1475170279, 
1469770219, 1462441336, 1474739469, 1488216507, 1475203721, 1468705558, 
1481722718, 1485897197, 1488669576, 1501288618, 1510266595, 1516828588, 
1497048175, 1516546144, 1507576242, 1517654363, 1496070298, 1519765220, 
1507408104, 1532046710, 1542196446, 1534747170, 1533605231, 1521381844, 
1545389880, 1537988628, 1544304998, 1524842149, 1551051077, 1540822870, 
1579775599, 1580337175, 1551486497, 1554879837, 1568620434, 1568701543, 
1556387550, 1561253396, 1582925482, 1562166384), class = c("POSIXct", 
"POSIXt"), tzone = "UTC"), place = list(c(7.59729413351368, 52.6052275122351
), c(9.99728447956781, 53.43773657253), c(10.1114473929533, 53.1295890148866
), c(7.74115218835801, 53.555354690339), c(9.82895066827581, 
53.1009319396015), c(10.061107415855, 53.1908752763309), c(10.1134381934544, 
53.1450558612239), c(8.59001735546083, 53.1767797285482), c(6.43939168487555, 
52.5520931654252), c(8.38811111096636, 53.9043055557574), c(6.20061916537948, 
52.462037409576), c(8.66656282486832, 52.8269702466929), c(9.92127490588442, 
53.1240045666796), c(9.77810957468704, 53.1445777603789), c(10.0972382106036, 
53.1604265989175), c(10.0473952445094, 53.1698097395641), c(9.23773401919961, 
53.2120381900218), c(8.29524237837988, 52.822332696399), c(6.63690696797941, 
53.4436726627048), c(6.89839325296288, 53.947454203445), c(6.97064542834721, 
54.2487197094445), c(9.98865072631714, 53.4088944299342), c(9.94164401569524, 
53.1500576073959), c(9.64242996587752, 52.9285470044703), c(10.1026940185685, 
53.1635394335485), c(9.94874529044194, 53.2202512735354), c(8.8025526552284, 
53.2423093779114), c(7.93352467761445, 52.9129105531343), c(6.6418846001424, 
53.2459031608081), c(7.56102465003101, 53.5306444680171), c(7.36619114998468, 
53.748869508885), c(7.40993284414052, 54.5367797663042), c(9.90022663895919, 
53.3726361099083), c(9.41110555596208, 52.5001044709056), c(10.1151193231519, 
53.1539029361817), c(10.1064400828529, 53.1793449776572), c(9.94235711256256, 
53.2622041055899), c(9.44215997717822, 53.4799339987572), c(7.03832846889284, 
53.1986115213435), c(7.32755360272354, 53.416700338513), c(7.57828611098173, 
53.6107027769073), c(7.55411005022882, 54.1905803935834))), row.names = c(NA, 
-42L), class = c("tbl_df", "tbl", "data.frame"))

Solution

  • If you truly want speed you can write your own C++ code to calculate the distance (because geosphere is quite slow) and time comparisons


    Example

    Save this code in a file, for example "~/Desktop/find_overlaps.cpp"

    you need to install Rcpp - install.packages("Rcpp")

    
    
    #include "Rcpp.h"
    #include <math.h>
    
    static const double earth = 6378137.0; // WSG-84 definition
    
    // haversine formula taken from the geodist library
    // - https://github.com/hypertidy/geodist
    double distance_haversine(double x1, double y1, double x2, double y2) {
    
      double cosy1 = cos( y1 * M_PI / 180.0 );
      double cosy2 = cos( y2 * M_PI / 180.0 );
    
      double sxd = sin ((x2 - x1) * M_PI / 360.0);
      double syd = sin ((y2 - y1) * M_PI / 360.0);
      double d = syd * syd + cosy1 * cosy2 * sxd * sxd;
      d = 2.0 * earth * asin (sqrt (d));
      return (d);
    }
    
    // returns true if second date is within 30 days of the first
    bool within_days( int first_date, int second_date ) {
      int days = 30 * 24 * 60 * 60;
      int lower_bound = first_date - days;
      int upper_bound = first_date + days;
    
      return lower_bound <= second_date && second_date <= upper_bound;
    }
    
    bool within_distance( Rcpp::NumericVector start_place, Rcpp::NumericVector end_place, double distance_limit = 20000.0 ) {
    
      double x1 = start_place[0];
      double y1 = start_place[1];
      double x2 = end_place[0];
      double y2 = end_place[1];
    
      return distance_haversine(x1, y1, x2, y2) <= distance_limit;
    }
    
    // [[Rcpp::export]]
    SEXP find_overlaps( Rcpp::NumericVector ids, Rcpp::IntegerVector dates, Rcpp::List place ) {
    
      R_xlen_t n = dates.length();
      R_xlen_t i, j;
    
      Rcpp::List res( n );
    
      R_xlen_t result_counter;
      for( i = 0; i < n; ++i ) {
    
        Rcpp::IntegerVector overlaps( n ); // initialise vector to store results
        result_counter = 0;
    
        for( j = 0; j < n; ++j ) {
          // ignore self-comparisons
          if( i != j ) {
    
            int first_date = dates[ i ];
            int second_date = dates[ j ];
    
            Rcpp::NumericVector first_place = place[ i ];
            Rcpp::NumericVector second_place = place[ j ];
    
            // check the place values exist
            if( first_place.length() != 2 || second_place.length() != 2 ) {
              continue;
            }
    
            if( within_days( first_date, second_date) && within_distance( first_place, second_place ) ) {
              overlaps[ result_counter ] = j;
              result_counter++;
            }
          }
    
          if( result_counter > 0 ) {
            Rcpp::IntegerVector id_idx = overlaps[ Rcpp::Range( 0, result_counter - 1 ) ];
            res[ i ] = ids[ id_idx ];
          }
    
        }
      }
      return res;
    }
    
    
    

    Then source it in R

    library(Rcpp)
    
    Rcpp::sourceCpp(file = "~/Desktop/find_overlaps.cpp")
    
    res <- find_overlaps( df$id, df$time, df$place )
    
    df$overlaps <- res
    
    df
    #    id                time               place overlaps
    # 1   1 2016-11-08 10:42:42 7.597294, 52.605228     NULL
    # 2   2 2016-09-29 17:31:19 9.997284, 53.437737     NULL
    # 3   3 2016-07-29 05:30:19  10.11145, 53.12959     NULL
    # 4   4 2016-05-05 09:42:16 7.741152, 53.555355     NULL
    # 5   5 2016-09-24 17:51:09 9.828951, 53.100932        7
    # 6   6 2017-02-27 17:28:27  10.06111, 53.19088     NULL
    # 7   7 2016-09-30 02:48:41  10.11344, 53.14506        5
    # 8   8 2016-07-16 21:45:58 8.590017, 53.176780     NULL
    # 9   9 2016-12-14 13:38:38 6.439392, 52.552093     NULL
    # 10 10 2017-01-31 21:13:17 8.388111, 53.904306     NULL
    # 11 11 2017-03-04 23:19:36 6.200619, 52.462037     NULL
    # 12 12 2017-07-29 00:36:58 8.666563, 52.826970     NULL
    # 13 13 2017-11-09 22:29:55 9.921275, 53.124005     NULL
    # 14 14 2018-01-24 21:16:28   9.77811, 53.14458       16
    # 15 15 2017-06-09 22:42:55  10.09724, 53.16043     NULL
    # 16 16 2018-01-21 14:49:04  10.04740, 53.16981       14
    # 17 17 2017-10-09 19:10:42 9.237734, 53.212038     NULL
    # 18 18 2018-02-03 10:39:23 8.295242, 52.822333     NULL
    # 19 19 2017-05-29 15:04:58 6.636907, 53.443673     NULL
    # 20 20 2018-02-27 21:00:20 6.898393, 53.947454     NULL
    # ...
    

    Notes