I'm working with origin-destination (OD) data that contains an origin ID and a destination ID in separate columns. Sometimes it's important to aggregate OD pairs that are the same, except that the origin and destination is swapped.
OD data looks like this:
orign dest value
E02002361 E02002361 109
E02002361 E02002363 38
E02002361 E02002367 10
E02002361 E02002371 44
E02002363 E02002361 34
In the above example, the first and the last rows can be considered as the same pair, except in the opposite direction. The challenge is how to efficiently identify that they are duplicates.
I created a package, stplanr, that can answer this question, as illustrated in the reproducible example below:
x = read.csv(stringsAsFactors = FALSE, text = "orign,dest,value
E02002361,E02002361,109
E02002361,E02002363,38
E02002361,E02002367,10
E02002361,E02002371,44
E02002363,E02002361,34")
duplicated(stplanr::od_id_order(x)[[3]])
#> Registered S3 method overwritten by 'R.oo':
#> method from
#> throw.default R.methodsS3
#> [1] FALSE FALSE FALSE FALSE TRUE
Created on 2019-07-27 by the reprex package (v0.3.0)
The problem with this approach is that it's slow for large datasets.
I've looked at fastest way to get Min from every column in a matrix? which suggests that pmin
is the most efficient way to get the minimum value across multiple columns (not 2), and we're already using this.
Unlike Removing duplicate combinations (irrespective of order) this question is about duplicate identification on only 2 columns and efficiency. The solution posted in Removing duplicate combinations (irrespective of order) seems to be slower than the slowest solution shown in the timings below.
A solution that is faster than this is the szudzik_pairing
function, created by my colleague Malcolm Morgan and based on a method developed by Matthew Szudzik.
We've tried each approach and the Szudzik approach does indeed seem faster but I'm wondering: is there a more efficient way (in any language, but preferably implementable in R)?
Here is a quick reproducible example of what we've done, including a simple benchmark showing timings:
od_id_order_base <- function(x, id1 = names(x)[1], id2 = names(x)[2]) {
data.frame(
stringsAsFactors = FALSE,
stplanr.id1 = x[[id1]],
stplanr.id1 = x[[id2]],
stplanr.key = paste(
pmin(x[[id1]], x[[id2]]),
pmax(x[[id1]], x[[id2]])
)
)
}
od_id_order_rfast <- function(x, id1 = names(x)[1], id2 = names(x)[2]) {
data.frame(
stringsAsFactors = FALSE,
stplanr.id1 = x[[id1]],
stplanr.id1 = x[[id2]],
stplanr.key = paste(
Rfast::colPmin(as.numeric(as.factor(x[[id1]])), as.numeric(as.factor(x[[id1]]))),
pmax(x[[id1]], x[[id2]])
)
)
}
od_id_order_dplyr <- function(x, id1 = names(x)[1], id2 = names(x)[2]) {
dplyr::transmute_(x,
stplanr.id1 = as.name(id1),
stplanr.id2 = as.name(id2),
stplanr.key = ~paste(pmin(stplanr.id1, stplanr.id2), pmax(stplanr.id1, stplanr.id2))
)
}
szudzik_pairing <- function(val1, val2, ordermatters = FALSE) {
if(length(val1) != length(val2)){
stop("val1 and val2 are not of equal length")
}
if(class(val1) == "factor"){
val1 <- as.character(val1)
}
if(class(val2) == "factor"){
val2 <- as.character(val2)
}
lvls <- unique(c(val1, val2))
val1 <- as.integer(factor(val1, levels = lvls))
val2 <- as.integer(factor(val2, levels = lvls))
if(ordermatters){
ismax <- val1 > val2
stplanr.key <- (ismax * 1) * (val1^2 + val1 + val2) + ((!ismax) * 1) * (val2^2 + val1)
}else{
a <- ifelse(val1 > val2, val2, val1)
b <- ifelse(val1 > val2, val1, val2)
stplanr.key <- b^2 + a
}
return(stplanr.key)
}
n = 1000
ids <- as.character(runif(n, 1e4, 1e7 - 1))
x <- data.frame(id1 = rep(ids, times = n),
id2 = rep(ids, each = n),
val = 1,
stringsAsFactors = FALSE)
head(od_id_order_base(x))
#> stplanr.id1 stplanr.id1.1 stplanr.key
#> 1 8515501.50763425 8515501.50763425 8515501.50763425 8515501.50763425
#> 2 2454738.52108038 8515501.50763425 2454738.52108038 8515501.50763425
#> 3 223811.25236322 8515501.50763425 223811.25236322 8515501.50763425
#> 4 4882305.41496906 8515501.50763425 4882305.41496906 8515501.50763425
#> 5 4663684.5752892 8515501.50763425 4663684.5752892 8515501.50763425
#> 6 725621.968830239 8515501.50763425 725621.968830239 8515501.50763425
head(od_id_order_rfast(x))
#> stplanr.id1 stplanr.id1.1 stplanr.key
#> 1 8515501.50763425 8515501.50763425 830 8515501.50763425
#> 2 2454738.52108038 8515501.50763425 163 8515501.50763425
#> 3 223811.25236322 8515501.50763425 135 8515501.50763425
#> 4 4882305.41496906 8515501.50763425 435 8515501.50763425
#> 5 4663684.5752892 8515501.50763425 408 8515501.50763425
#> 6 725621.968830239 8515501.50763425 689 8515501.50763425
head(od_id_order_dplyr(x))
#> Warning: transmute_() is deprecated.
#> Please use transmute() instead
#>
#> The 'programming' vignette or the tidyeval book can help you
#> to program with transmute() : https://tidyeval.tidyverse.org
#> This warning is displayed once per session.
#> stplanr.id1 stplanr.id2 stplanr.key
#> 1 8515501.50763425 8515501.50763425 8515501.50763425 8515501.50763425
#> 2 2454738.52108038 8515501.50763425 2454738.52108038 8515501.50763425
#> 3 223811.25236322 8515501.50763425 223811.25236322 8515501.50763425
#> 4 4882305.41496906 8515501.50763425 4882305.41496906 8515501.50763425
#> 5 4663684.5752892 8515501.50763425 4663684.5752892 8515501.50763425
#> 6 725621.968830239 8515501.50763425 725621.968830239 8515501.50763425
head(szudzik_pairing(x$id1, x$id2))
#> [1] 2 5 10 17 26 37
system.time(od_id_order_base(x))
#> user system elapsed
#> 0.467 0.000 0.467
system.time(od_id_order_rfast(x))
#> user system elapsed
#> 1.063 0.001 1.064
system.time(od_id_order_dplyr(x))
#> user system elapsed
#> 0.493 0.000 0.493
system.time(szudzik_pairing(x$id1, x$id2))
#> user system elapsed
#> 0.100 0.000 0.101
Created on 2019-07-27 by the reprex package (v0.3.0)
devtools::session_info()
#> ─ Session info ──────────────────────────────────────────────────────────
#> setting value
#> version R version 3.6.0 (2019-04-26)
#> os Debian GNU/Linux 9 (stretch)
#> system x86_64, linux-gnu
#> ui X11
#> language (EN)
#> collate en_US.UTF-8
#> ctype en_US.UTF-8
#> tz Etc/UTC
#> date 2019-07-27
#>
#> ─ Packages ──────────────────────────────────────────────────────────────
#> package * version date lib source
#> assertthat 0.2.1 2019-03-21 [1] CRAN (R 3.6.0)
#> backports 1.1.4 2019-04-10 [1] CRAN (R 3.6.0)
#> callr 3.3.0 2019-07-04 [1] CRAN (R 3.6.0)
#> cli 1.1.0 2019-03-19 [1] CRAN (R 3.6.0)
#> crayon 1.3.4 2017-09-16 [1] CRAN (R 3.6.0)
#> desc 1.2.0 2018-05-01 [1] CRAN (R 3.6.0)
#> devtools 2.1.0 2019-07-06 [1] CRAN (R 3.6.0)
#> digest 0.6.20 2019-07-04 [1] CRAN (R 3.6.0)
#> dplyr 0.8.3 2019-07-04 [1] CRAN (R 3.6.0)
#> evaluate 0.14 2019-05-28 [1] CRAN (R 3.6.0)
#> fs 1.3.1 2019-05-06 [1] CRAN (R 3.6.0)
#> glue 1.3.1 2019-03-12 [1] CRAN (R 3.6.0)
#> highr 0.8 2019-03-20 [1] CRAN (R 3.6.0)
#> htmltools 0.3.6 2017-04-28 [1] CRAN (R 3.6.0)
#> knitr 1.23 2019-05-18 [1] CRAN (R 3.6.0)
#> magrittr 1.5 2014-11-22 [1] CRAN (R 3.6.0)
#> memoise 1.1.0 2017-04-21 [1] CRAN (R 3.6.0)
#> pillar 1.4.2 2019-06-29 [1] CRAN (R 3.6.0)
#> pkgbuild 1.0.3 2019-03-20 [1] CRAN (R 3.6.0)
#> pkgconfig 2.0.2 2018-08-16 [1] CRAN (R 3.6.0)
#> pkgload 1.0.2 2018-10-29 [1] CRAN (R 3.6.0)
#> prettyunits 1.0.2 2015-07-13 [1] CRAN (R 3.6.0)
#> processx 3.4.0 2019-07-03 [1] CRAN (R 3.6.0)
#> ps 1.3.0 2018-12-21 [1] CRAN (R 3.6.0)
#> purrr 0.3.2 2019-03-15 [1] CRAN (R 3.6.0)
#> R6 2.4.0 2019-02-14 [1] CRAN (R 3.6.0)
#> Rcpp 1.0.1 2019-03-17 [1] CRAN (R 3.6.0)
#> RcppZiggurat 0.1.5 2018-06-10 [1] CRAN (R 3.6.0)
#> remotes 2.1.0 2019-06-24 [1] CRAN (R 3.6.0)
#> Rfast 1.9.4 2019-05-25 [1] CRAN (R 3.6.0)
#> rlang 0.4.0 2019-06-25 [1] CRAN (R 3.6.0)
#> rmarkdown 1.13 2019-05-22 [1] CRAN (R 3.6.0)
#> rprojroot 1.3-2 2018-01-03 [1] CRAN (R 3.6.0)
#> sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 3.6.0)
#> stringi 1.4.3 2019-03-12 [1] CRAN (R 3.6.0)
#> stringr 1.4.0 2019-02-10 [1] CRAN (R 3.6.0)
#> testthat 2.1.1 2019-04-23 [1] CRAN (R 3.6.0)
#> tibble 2.1.3 2019-06-06 [1] CRAN (R 3.6.0)
#> tidyselect 0.2.5 2018-10-11 [1] CRAN (R 3.6.0)
#> usethis 1.5.1 2019-07-04 [1] CRAN (R 3.6.0)
#> withr 2.1.2 2018-03-15 [1] CRAN (R 3.6.0)
#> xfun 0.8 2019-06-25 [1] CRAN (R 3.6.0)
#> yaml 2.2.0 2018-07-25 [1] CRAN (R 3.6.0)
#>
#> [1] /usr/local/lib/R/site-library
#> [2] /usr/local/lib/R/library
Here are four possible approaches to generate unique keys from tuples (x, y) where order doesn't matter: od_id_order_base
and szudzik_pairing
, per OP's question; a modified Szudzik method szudzik_pairing_alt
; and a method max_min
that uses the formula shown in this answer.
convert_to_numeric <- function(x, y) {
if (length(x) != length(y)) stop("x and y are not of equal length")
if (class(x) == "factor") x <- as.character(x)
if (class(y) == "factor") y <- as.character(y)
lvls <- unique(c(x, y))
x <- as.integer(factor(x, levels = lvls))
y <- as.integer(factor(y, levels = lvls))
list(x = x, y = y)
}
od_id_order_base <- function(x, y) {
d <- convert_to_numeric(x, y)
x <- d$x
y <- d$y
paste(pmin(x, y), pmax(x, y))
}
szudzik_pairing <- function(x, y) {
d <- convert_to_numeric(x, y)
x <- d$x
y <- d$y
a <- ifelse(x > y, y, x)
b <- ifelse(x > y, x, y)
b^2 + a
}
szudzik_pairing_alt <- function(x, y) {
d <- convert_to_numeric(x, y)
x <- d$x
y <- d$y
z <- y^2 + x
ifelse(y < x, x^2 + y, z)
}
max_min <- function(x, y) {
d <- convert_to_numeric(x, y)
x <- d$x
y <- d$y
a <- pmax(x, y)
b <- pmin(x, y)
a * (a + 1) / 2 + b
}
Generate the keys from some sample data and verify that we get the same results:
check_dupe <- function(f, x, y) duplicated(f(x, y))
set.seed(123)
n <- 1000^2
x <- ceiling(runif(n) * 1000)
y <- ceiling(runif(n) * 1000)
p <- lapply(list(od_id_order_base, szudzik_pairing, szudzik_pairing_alt,
max_min), check_dupe, x, y)
all(sapply(p[-1], function(x) identical(p[[1]], as.vector(x))))
# [1] TRUE
Benchmarking:
bmk <- microbenchmark::microbenchmark(
p1 = check_dupe(od_id_order_base, x, y),
p2 = check_dupe(szudzik_pairing, x, y),
p3 = check_dupe(szudzik_pairing_alt, x, y),
p4 = check_dupe(max_min, x, y),
times = 100
)
# Unit: seconds
# expr min lq mean median uq max neval cld
# p1 2.958512 3.089615 3.336621 3.336915 3.474973 4.721378 100 b
# p2 1.934742 2.058185 2.191331 2.190588 2.306203 2.983729 100 a
# p3 1.889201 1.990306 2.173845 2.138995 2.259218 5.186751 100 a
# p4 1.870261 1.980756 2.143026 2.145458 2.234580 3.111324 100 a