I have downloaded a list of all the towns and cities etc in the US from the census bureau. Here is a random sample:
dput(somewhere)
structure(list(state = structure(c(30L, 31L, 5L, 31L, 24L, 36L,
13L, 21L, 6L, 10L, 31L, 28L, 10L, 5L, 5L, 8L, 23L, 11L, 34L,
19L, 29L, 4L, 24L, 13L, 21L, 31L, 2L, 3L, 29L, 24L, 1L, 13L,
15L, 10L, 11L, 33L, 35L, 8L, 11L, 12L, 36L, 28L, 9L, 31L, 8L,
14L, 11L, 12L, 36L, 13L, 8L, 5L, 29L, 8L, 7L, 23L, 25L, 39L,
16L, 28L, 10L, 29L, 26L, 8L, 32L, 40L, 28L, 23L, 37L, 31L, 18L,
5L, 1L, 31L, 18L, 13L, 11L, 10L, 25L, 18L, 21L, 18L, 11L, 35L,
31L, 36L, 20L, 19L, 38L, 2L, 40L, 13L, 36L, 11L, 29L, 27L, 22L,
17L, 12L, 20L), .Label = c("ak", "al", "ar", "az", "ca", "co",
"fl", "ga", "hi", "ia", "il", "in", "ks", "ky", "la", "md", "mi",
"mo", "ms", "mt", "nc", "nd", "ne", "nj", "nm", "nv", "ny", "oh",
"ok", "or", "pa", "pr", "ri", "sc", "sd", "tx", "ut", "va", "wa",
"wi"), class = "factor"), geoid = c(4120100L, 4280962L, 668028L,
4243944L, 3460600L, 4871948L, 2046000L, 3747695L, 839965L, 1909910L,
4244824L, 3902204L, 1963750L, 622360L, 669088L, 1382972L, 3125230L,
1722736L, 4539265L, 2804705L, 4039625L, 451465L, 3467020L, 2077150L,
3765260L, 4221792L, 133904L, 566320L, 4033150L, 3463180L, 223460L,
2013675L, 2232405L, 1951600L, 1752142L, 4445010L, 4655684L, 1336416L,
1729080L, 1840842L, 4804672L, 3932207L, 1523675L, 4260384L, 1321912L,
2159232L, 1735307L, 1867176L, 4839304L, 2057350L, 1309656L, 655380L,
4082250L, 1350680L, 1275475L, 3147745L, 3505010L, 5352285L, 2483337L,
3909834L, 1912945L, 4068200L, 3227900L, 1366304L, 7286831L, 5505350L,
3982390L, 3149915L, 4974480L, 4249440L, 2943346L, 677430L, 280770L,
4247872L, 2902242L, 2039075L, 1735281L, 1932565L, 3580120L, 2973852L,
3722620L, 2943238L, 1755938L, 4643100L, 4251904L, 4830920L, 3056575L,
2801940L, 5156048L, 137000L, 5508925L, 2057300L, 4861172L, 1736477L,
4021200L, 3677783L, 3832060L, 2614900L, 1820332L, 3043750L),
ansicode = c(2410344L, 2390453L, 2411793L, 1214759L, 885360L,
2412035L, 485621L, 2403359L, 2412812L, 2583481L, 2390095L,
2397971L, 2396237L, 2585422L, 2411819L, 2405746L, 2398338L,
2394628L, 2812929L, 2586582L, 2408478L, 2582836L, 885393L,
2397270L, 2402898L, 2584453L, 2405811L, 2405518L, 2412737L,
2389752L, 2418574L, 2393549L, 2402559L, 2629970L, 2399453L,
2378109L, 2812999L, 2402563L, 2398956L, 2396699L, 2409759L,
2393028L, 2414061L, 2805542L, 2404192L, 2404475L, 2398514L,
2629884L, 2408486L, 2396265L, 2405306L, 2411363L, 2413515L,
2405064L, 2402989L, 2583899L, 2629103L, 2585016L, 2390487L,
2397481L, 2393811L, 2413298L, 2583927L, 2812702L, 2415078L,
1582764L, 2400116L, 2400036L, 2412013L, 2633665L, 2787908L,
2583158L, 2418866L, 1214943L, 2393998L, 485611L, 2398513L,
2394969L, 2806756L, 2397053L, 2406485L, 2395719L, 2399572L,
1267480L, 2389516L, 2410660L, 2409026L, 2806379L, 2584894L,
2404746L, 2586459L, 2396263L, 2411528L, 2398556L, 2412443L,
2584298L, 1036064L, 2806333L, 2396920L, 2804282L), city = c("donald",
"warminster heights", "san juan capistrano", "littlestown",
"port republic", "taylor", "merriam", "northlakes", "julesburg",
"california junction", "lower allen", "antwerp", "pleasantville",
"el rancho", "santa clarita", "willacoochee", "kennard",
"effingham", "la france", "beechwood", "keys", "orange grove mobile manor",
"shiloh", "west mineral", "stony point", "east salem", "heath",
"stamps", "haworth", "rio grande", "ester", "clayton", "hackberry",
"middle amana", "new baden", "melville", "rolland colony",
"hannahs mill", "germantown hills", "la fontaine", "aurora",
"green meadows", "kaiminani", "pinecroft", "dawson", "park city",
"hinsdale", "st. meinrad", "kingsland", "powhattan", "bowersville",
"palos verdes estates", "wyandotte", "meigs", "waverly",
"sunol", "arroyo hondo", "outlook", "west pocomoke", "buchtel",
"chatsworth", "smith village", "glenbrook", "rock spring",
"villalba", "bayfield", "waynesfield", "utica", "sunset",
"milford square", "lithium", "swall meadows", "unalaska",
"martinsburg", "ashland", "leawood", "hindsboro", "gray",
"turley", "trimble", "falcon", "linn", "olympia fields",
"mitchell", "mount pleasant mills", "greenville", "park city",
"arkabutla", "new river", "huntsville", "boulder junction",
"potwin", "red lick", "huey", "dougherty", "wadsworth", "grand forks",
"chassell", "edgewood", "lindsay"), lsad = c("25", "57",
"25", "21", "25", "25", "25", "57", "43", "57", "57", "47",
"25", "57", "25", "25", "47", "25", "57", "57", "57", "57",
"21", "25", "57", "57", "43", "25", "43", "57", "57", "25",
"57", "57", "47", "57", "57", "57", "47", "43", "25", "57",
"57", "57", "25", "25", "47", "57", "57", "25", "43", "25",
"43", "25", "57", "57", "57", "57", "57", "47", "25", "43",
"57", "57", "62", "25", "47", "47", "25", "57", "57", "57",
"25", "21", "25", "25", "47", "25", "57", "25", "43", "25",
"47", "25", "57", "25", "57", "57", "57", "25", "57", "25",
"25", "47", "43", "57", "25", "57", "43", "57"), funcstat = c("a",
"s", "a", "a", "a", "a", "a", "s", "a", "s", "s", "a", "a",
"s", "a", "a", "a", "a", "s", "s", "s", "s", "a", "a", "s",
"s", "a", "a", "a", "s", "s", "a", "s", "s", "a", "s", "s",
"s", "a", "a", "a", "s", "s", "s", "a", "a", "a", "s", "s",
"a", "a", "a", "a", "a", "s", "s", "s", "s", "s", "a", "a",
"a", "s", "s", "s", "a", "a", "a", "a", "s", "s", "s", "a",
"a", "a", "a", "a", "a", "s", "a", "a", "a", "a", "a", "s",
"a", "s", "s", "s", "a", "s", "a", "a", "a", "a", "s", "a",
"s", "a", "s"), latitude = c(45.221487, 40.18837, 33.500889,
39.74517, 39.534798, 30.573263, 39.017607, 35.780523, 40.984864,
41.56017, 40.226748, 41.180176, 41.387011, 36.220684, 34.414083,
31.335094, 41.474697, 39.120662, 34.616281, 32.336723, 35.802786,
32.598451, 39.462418, 37.283906, 35.867809, 40.608713, 31.344839,
33.354959, 33.840898, 39.019051, 64.879056, 39.736866, 29.964958,
41.794765, 38.536765, 41.559549, 44.3437, 32.937302, 40.768954,
40.673893, 33.055942, 39.867193, 19.757709, 40.564189, 31.771864,
37.093499, 41.800683, 38.168142, 30.666141, 39.761734, 34.373065,
33.774271, 36.807143, 31.071788, 27.985282, 41.154105, 36.534599,
46.331153, 38.096527, 39.463511, 42.916301, 35.45079, 39.100123,
34.81467, 18.127809, 46.81399, 40.602442, 40.895279, 41.13806,
40.433182, 37.831844, 37.50606, 53.910255, 40.310917, 38.812464,
38.907263, 39.684775, 41.841711, 36.736661, 39.476152, 35.194804,
38.478798, 41.521996, 43.730057, 40.724697, 33.111939, 45.630946,
34.700227, 37.142945, 34.782275, 46.1148, 37.938624, 33.485081,
38.605285, 34.399808, 42.821447, 47.921291, 47.036116, 40.103208,
47.224885), longitude = c(-122.837813, -75.084089, -117.654388,
-77.089213, -74.476099, -97.427116, -94.693955, -81.367835,
-102.262708, -95.994752, -76.902769, -84.736099, -93.272787,
-119.068357, -118.494729, -83.044003, -96.203696, -88.550859,
-82.770697, -90.808692, -94.941358, -114.660588, -75.29244,
-94.926801, -81.044121, -77.23694, -86.46905, -93.497879,
-94.657035, -74.87787, -148.041153, -100.176484, -93.410178,
-91.901539, -89.707193, -71.301933, -96.59226, -84.340945,
-89.462982, -85.722023, -97.509615, -83.945334, -156.001765,
-78.353464, -84.443499, -86.048077, -87.928172, -86.832128,
-98.454026, -95.634011, -83.084305, -118.425754, -94.729305,
-84.092683, -81.625304, -102.762746, -105.666602, -120.092812,
-75.579197, -82.180426, -96.514499, -97.457006, -119.927289,
-85.238869, -66.481897, -90.822546, -83.973881, -97.345349,
-112.028388, -75.405024, -89.88325, -118.642656, -166.529029,
-78.324286, -92.239531, -94.62524, -88.134729, -94.985863,
-107.792147, -94.561898, -78.65389, -91.844989, -87.691648,
-98.029974, -77.026451, -96.110256, -108.925311, -90.121565,
-80.595817, -86.532599, -89.654438, -97.01835, -94.161474,
-89.289973, -97.05148, -77.893875, -97.08933, -88.530745,
-85.737461, -105.152791), designation = c("city", "cdp",
"city", "borough", "city", "city", "city", "cdp", "town",
"cdp", "cdp", "village", "city", "cdp", "city", "city", "village",
"city", "cdp", "cdp", "cdp", "cdp", "borough", "city", "cdp",
"cdp", "town", "city", "town", "cdp", "cdp", "city", "cdp",
"cdp", "village", "cdp", "cdp", "cdp", "village", "town",
"city", "cdp", "cdp", "cdp", "city", "city", "village", "cdp",
"cdp", "city", "town", "city", "town", "city", "cdp", "cdp",
"cdp", "cdp", "cdp", "village", "city", "town", "cdp", "cdp",
"urbana", "city", "village", "village", "city", "cdp", "cdp",
"cdp", "city", "borough", "city", "city", "village", "city",
"cdp", "city", "town", "city", "village", "city", "cdp",
"city", "cdp", "cdp", "cdp", "city", "cdp", "city", "city",
"village", "town", "cdp", "city", "cdp", "town", "cdp")), row.names = c(22769L,
24845L, 3314L, 24015L, 17360L, 28139L, 10085L, 19881L, 3886L,
8750L, 24027L, 20585L, 9362L, 2499L, 3333L, 6041L, 16321L, 6847L,
25249L, 14051L, 22233L, 1210L, 17425L, 10353L, 20053L, 23545L,
253L, 1951L, 22166L, 17386L, 685L, 9771L, 11134L, 9225L, 7386L,
25001L, 25862L, 5663L, 6950L, 8239L, 26555L, 20991L, 6108L, 24388L,
5551L, 10772L, 7056L, 8470L, 27292L, 10202L, 5451L, 3116L, 22660L,
5776L, 5317L, 16546L, 17582L, 29958L, 12103L, 20709L, 8779L,
22515L, 16665L, 5902L, 31901L, 30658L, 21745L, 16574L, 28632L,
24127L, 15046L, 3455L, 930L, 24087L, 14494L, 10016L, 7055L, 8993L,
18048L, 15434L, 19615L, 15043L, 7454L, 25775L, 24194L, 27115L,
15857L, 14038L, 29305L, 276L, 30693L, 10201L, 27863L, 7075L,
22046L, 19267L, 20311L, 12502L, 8093L, 15798L), class = "data.frame")
I would like to calculate the distance between each entry in the city
column using the longitude
and latitude
columns and the gdist
function. I know that the following for
loop works and is easy to read:
dist_list <- list()
for (i in 1:nrow(somewhere)) {
dist_list[[i]] <- gdist(lon.1 = somewhere$longitude[i],
lat.1 = somewhere$latitude[i],
lon.2 = somewhere$longitude,
lat.2 = somewhere$latitude,
units="miles")
}
However: IT TAKES FOREVER to run on the full dataset (31K+ rows)--as in hours. I'm looking for something that will speed up this calculation. I figured something in the split-apply-combine
approach would work well, since I would like to eventually involve a pair of grouping variables, geo_block
and ansi_block
but honestly anything would be better than what I have.
I have tried the following:
somewhere$geo_block <- substr(somewhere$geoid, 1, 1)
somewhere$ansi_block <- substr(somewhere$ansicode, 1, 1)
somewhere <- somewhere %>%
split(.$geo_block, .$ansi_block) %>%
mutate(dist = gdist(longlat = somewhere[, c("longitude", "latitude")]))
But am unsure of how to specify the second set of long-lat inputs outside of the standard for
loop.
My question:
split-apply-combine
approach to solve this problem with geo_block
and ansi_block
as a grouping variable as above? I would like to return the shortest distance and the name of the city
and the value of geo_block
corresponding to this distance.All suggestions are welcome. Ideally the desired result would be fairly quick because the actual dataset I'm working with is quite large. Since I'm a bit in the woods here, I've added a bounty to the question to generate a little more interest and hopefully a wide set of potential answers that I can learn from. Thanks so much!
I'll show both the tidyverse and base R approaches to split-apply-combine. My understanding is that, for each city in each group of cities (whatever your grouping variables may be), you want to report the nearest within-group city and the distance to that nearest city.
First, a couple of remarks about Imap::gdist
:
lonlat
argument. You must use the arguments (lon|lat).(1|2)
to pass coordinates.while (abs(lamda - lamda.old) > 1e-11) {...}
, where the value of lamda
is (lon.1 - lon.2) * pi / 180
. For the loop condition to have length 1 (it should), the arguments lon.1
and lon.2
must have length 1.So, at least for now, I'll base my answer on the more canonical geosphere::distm
. It stores the entire n
-by-n
symmetric distance matrix in memory, so you may hit a memory allocation limit, but that is much less likely to happen within a split-apply-combine, where n
is the within-group (rather than total) number of cities.
I'll be working with this part of your data frame:
library("dplyr")
dd <- somewhere %>%
as_tibble() %>%
mutate(geo_block = as.factor(as.integer(substr(geoid, 1L, 1L))),
ansi_block = as.factor(as.integer(substr(ansicode, 1L, 1L)))) %>%
select(geo_block, ansi_block, city, longitude, latitude)
dd
# # A tibble: 100 × 5
# geo_block ansi_block city longitude latitude
# <fct> <fct> <chr> <dbl> <dbl>
# 1 4 2 donald -123. 45.2
# 2 4 2 warminster heights -75.1 40.2
# 3 6 2 san juan capistrano -118. 33.5
# 4 4 1 littlestown -77.1 39.7
# 5 3 8 port republic -74.5 39.5
# 6 4 2 taylor -97.4 30.6
# 7 2 4 merriam -94.7 39.0
# 8 3 2 northlakes -81.4 35.8
# 9 8 2 julesburg -102. 41.0
# 10 1 2 california junction -96.0 41.6
# # … with 90 more rows
The following function find_nearest
performs the basic task of identifying nearest neighbours given a set of n
points in a d
-dimensional metric space. Its arguments are as follows:
data
: A data frame with n
rows (one per point), d
variables
specifying point coordinates, and 1 or more ID variables to be
associated with each point.dist
: A function taking an n
-by-d
matrix of coordinates as
an argument and returning a corresponding n
-by-n
symmetric
distance matrix.coordvar
: A character vector listing coordinate variable names.idvar
: A character vector listing ID variable names.In our data frame, the coordinate variables are longitude
and latitude
and there is one ID variable city
. For simplicity, I have defined the default values of coordvar
and idvar
accordingly.
find_nearest <- function(data, dist,
coordvar = c("longitude", "latitude"),
idvar = "city") {
m <- match(coordvar, names(data), 0L)
n <- nrow(data)
if (n < 2L) {
argmin <- NA_integer_[n]
distance <- NA_real_[n]
} else {
## Compute distance matrix
D <- dist(as.matrix(data[m]))
## Extract minimum distances
diag(D) <- Inf # want off-diagonal distances
argmin <- apply(D, 2L, which.min)
distance <- D[cbind(argmin, seq_len(n))]
}
## Return focal point data, nearest neighbour ID, distance
r1 <- data[-m]
r2 <- data[argmin, idvar, drop = FALSE]
names(r2) <- paste0(idvar, "_nearest")
data.frame(r1, r2, distance, row.names = NULL, stringsAsFactors = FALSE)
}
Here is the output of find_nearest
applied to our data frame, without grouping, with distances computed by the Vincenty ellipsoid method.
dist_vel <- function(x) {
geosphere::distm(x, fun = geosphere::distVincentyEllipsoid)
}
res <- find_nearest(dd, dist = dist_vel)
head(res, 10L)
# geo_block ansi_block city city_nearest distance
# 1 4 2 donald outlook 246536.39
# 2 4 2 warminster heights milford square 38512.79
# 3 6 2 san juan capistrano palos verdes estates 77722.35
# 4 4 1 littlestown lower allen 55792.53
# 5 3 8 port republic rio grande 66935.70
# 6 4 2 taylor kingsland 98997.90
# 7 2 4 merriam leawood 13620.87
# 8 3 2 northlakes stony point 30813.46
# 9 8 2 julesburg sunol 46037.81
# 10 1 2 california junction kennard 19857.41
Here is the tidyverse split-apply-combine, grouping on geo_block
and ansi_block
:
dd %>%
group_by(geo_block, ansi_block) %>%
group_modify(~find_nearest(., dist = dist_vel))
# # A tibble: 100 × 5
# # Groups: geo_block, ansi_block [13]
# geo_block ansi_block city city_nearest distance
# <fct> <fct> <chr> <chr> <dbl>
# 1 1 2 california junction gray 89610.
# 2 1 2 pleasantville middle amana 122974.
# 3 1 2 willacoochee meigs 104116.
# 4 1 2 effingham hindsboro 72160.
# 5 1 2 heath dawson 198052.
# 6 1 2 middle amana pleasantville 122974.
# 7 1 2 new baden huey 37147.
# 8 1 2 hannahs mill dawson 129599.
# 9 1 2 germantown hills hindsboro 165140.
# 10 1 2 la fontaine edgewood 63384.
# # … with 90 more rows
Note, for example, how the city considered nearest to California Junction changes from Kennard to Gray, which is farther away, under this grouping.
Here is the base R split-apply-combine, built on base function by
. It is equivalent except for the ordering of groups in the result:
find_nearest_by <- function(data, by, ...) {
do.call(rbind, base::by(data, by, find_nearest, ...))
}
res <- find_nearest_by(dd, by = dd[c("geo_block", "ansi_block")], dist = dist_vel)
head(res, 10L)
# geo_block ansi_block city city_nearest distance
# 1 3 1 grand forks <NA> NA
# 2 4 1 littlestown martinsburg 122718.95
# 3 4 1 martinsburg littlestown 122718.95
# 4 4 1 mitchell martinsburg 1671365.58
# 5 5 1 bayfield <NA> NA
# 6 1 2 california junction gray 89609.71
# 7 1 2 pleasantville middle amana 122974.32
# 8 1 2 willacoochee meigs 104116.21
# 9 1 2 effingham hindsboro 72160.43
# 10 1 2 heath dawson 198051.76
This ordering reveals that find_nearest
returns NA
for groups containing only one city. We would have seen these NA
in the tidyverse result had we printed the entire tibble.
FWIW, here is a benchmark of the various functions implemented in geosphere
for computing geodesic distances, saying nothing about accuracy, though you can speculate from the results that the Vincenty ellipsoid distance cuts the fewest corners (haha)...
fn <- function(dist) find_nearest(dd, dist = dist)
library("geosphere")
dist_geo <- function(x) distm(x, fun = distGeo)
dist_cos <- function(x) distm(x, fun = distCosine)
dist_hav <- function(x) distm(x, fun = distHaversine)
dist_vsp <- function(x) distm(x, fun = distVincentySphere)
dist_vel <- function(x) distm(x, fun = distVincentyEllipsoid)
dist_mee <- function(x) distm(x, fun = distMeeus)
microbenchmark::microbenchmark(
fn(dist_geo),
fn(dist_cos),
fn(dist_hav),
fn(dist_vsp),
fn(dist_vel),
fn(dist_mee),
times = 1000L
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# fn(dist_geo) 6.143276 6.291737 6.718329 6.362257 6.459345 45.91131 1000
# fn(dist_cos) 4.239236 4.399977 4.918079 4.461804 4.572033 45.70233 1000
# fn(dist_hav) 4.005331 4.156067 4.641016 4.210721 4.307542 41.91619 1000
# fn(dist_vsp) 3.827227 3.979829 4.446428 4.033621 4.123924 44.29160 1000
# fn(dist_vel) 129.712069 132.549638 135.006170 133.935479 135.248135 174.88874 1000
# fn(dist_mee) 3.716814 3.830999 4.234231 3.883582 3.962712 42.12947 1000
Imap::gdist
seems to be a pure R implementation of Vincenty ellipsoid distance. That may account for its slowness...
Some final remarks:
dist
argument of find_nearest
need not be built on one of the geosphere
distances. You can pass any function you want satisfying the constraints that I state above.find_nearest
could be adapted to accept functions dist
returning objects of class "dist"
. These objects store just the n*(n-1)/2
lower triangular elements of the n
-by-n
symmetric distance matrix (see ?dist
). This would improve support for large n
and make find_nearest
compatible with this memory-efficient implementation of the Haversine distance (suggested by @dww in the comments). It would just need to be worked out how to translate an operation like apply(D, 2L, which.min)
. [Update: I have implemented this change to find_nearest
in a separate answer, where I provide new benchmarks.]