I am attempting to calculate summary statistics for the the administrative units within 5km of every administrative in a shapefile of Jamaica. I use dnearneigh() to find which administrative units these are, but then I am not sure how best to work with the list I get as an output to calculate summary statistics. I tried using spatial subsetting but that has not worked, so advice on how best to carry out this operation would be useful.
shp_centroid <- st_point_on_surface(sf_communities)
shp_centroid_coord <- st_coordinates(shp_centroid)
shp_dist <- dnearneigh(shp_centroid_coord,0,5000)
subset <- sf_communities[unlist(shp_dist),]
One option is to go with nested tibbles / data.frames where every administrative unit has its own tibble of neighbours + itself. Thanks to rowwise operations in dplyr
, collecting summary statistics from such structures is quite convenient. There's always an option to use unnest()
to lengthen that nested dataset and go with group_by()
/summarise()
instead.
The following example uses nc
dataset from sf
package, distance threshold is set to 50km to better match those shape sizes. Instead of spdep
package, it just uses sf
functions, meaning that the list we'll work with (sgbp
- sparse geometry binary predicate lists, standard sf
stuff) might have a slightly different structure. Here the neighbourhood is defined as an intersection of 50km buffer and county polygons, not county centroids as described in the Question title; so this might be another detail to change / review.
library(sf)
#> Linking to GEOS 3.9.3, GDAL 3.5.2, PROJ 8.2.1; sf_use_s2() is TRUE
library(dplyr, warn.conflicts = FALSE)
library(purrr)
library(tidyr)
library(ggplot2)
# example data from sf package examples
nc = read_sf(system.file("shape/nc.shp", package="sf")) %>%
select(NAME, AREA, starts_with("BIR"))
nc
#> Simple feature collection with 100 features and 4 fields
#> Geometry type: MULTIPOLYGON
#> Dimension: XY
#> Bounding box: xmin: -84.32385 ymin: 33.88199 xmax: -75.45698 ymax: 36.58965
#> Geodetic CRS: NAD27
#> # A tibble: 100 × 5
#> NAME AREA BIR74 BIR79 geometry
#> <chr> <dbl> <dbl> <dbl> <MULTIPOLYGON [°]>
#> 1 Ashe 0.114 1091 1364 (((-81.47276 36.23436, -81.54084 36.27251, -81…
#> 2 Alleghany 0.061 487 542 (((-81.23989 36.36536, -81.24069 36.37942, -81…
#> 3 Surry 0.143 3188 3616 (((-80.45634 36.24256, -80.47639 36.25473, -80…
#> 4 Currituck 0.07 508 830 (((-76.00897 36.3196, -76.01735 36.33773, -76.…
#> 5 Northampton 0.153 1421 1606 (((-77.21767 36.24098, -77.23461 36.2146, -77.…
#> 6 Hertford 0.097 1452 1838 (((-76.74506 36.23392, -76.98069 36.23024, -76…
#> 7 Camden 0.062 286 350 (((-76.00897 36.3196, -75.95718 36.19377, -75.…
#> 8 Gates 0.091 420 594 (((-76.56251 36.34057, -76.60424 36.31498, -76…
#> 9 Warren 0.118 968 1190 (((-78.30876 36.26004, -78.28293 36.29188, -78…
#> 10 Stokes 0.124 1612 2038 (((-80.02567 36.25023, -80.45301 36.25709, -80…
#> # ℹ 90 more rows
# test and visualize neighbourhood subsetting for a single county (Lee)
lee = tibble::lst(
polygon = nc[nc$NAME == "Lee","geometry"],
centroid = st_centroid(polygon),
buffer = st_buffer(centroid, 50000),
within = st_filter(nc, buffer)
)
ggplot() +
geom_sf(data = nc) +
geom_sf(data = lee$within, fill = "green") +
geom_sf(data = lee$polygon, fill = "red") +
geom_sf(data = lee$buffer, alpha = .6, fill = "gold") +
geom_sf(data = lee$centroid, size = 1)
# resulting within_dist type is sgbp, "sparse geometry binary predicate lists",
# a list of sf object row indeces that intersect with the buffer
nc <- nc %>%
mutate(within_dist = st_centroid(geometry) %>%
st_buffer(50000) %>%
st_intersects(geometry)
, .before = "geometry")
nc
#> Simple feature collection with 100 features and 5 fields
#> Geometry type: MULTIPOLYGON
#> Dimension: XY
#> Bounding box: xmin: -84.32385 ymin: 33.88199 xmax: -75.45698 ymax: 36.58965
#> Geodetic CRS: NAD27
#> # A tibble: 100 × 6
#> NAME AREA BIR74 BIR79 within_dist geometry
#> * <chr> <dbl> <dbl> <dbl> <sgbp[,100]> <MULTIPOLYGON [°]>
#> 1 Ashe 0.114 1091 1364 <int [7]> (((-81.47276 36.23436, -81.54084 …
#> 2 Alleghany 0.061 487 542 <int [7]> (((-81.23989 36.36536, -81.24069 …
#> 3 Surry 0.143 3188 3616 <int [9]> (((-80.45634 36.24256, -80.47639 …
#> 4 Currituck 0.07 508 830 <int [8]> (((-76.00897 36.3196, -76.01735 3…
#> 5 Northampton 0.153 1421 1606 <int [9]> (((-77.21767 36.24098, -77.23461 …
#> 6 Hertford 0.097 1452 1838 <int [10]> (((-76.74506 36.23392, -76.98069 …
#> 7 Camden 0.062 286 350 <int [11]> (((-76.00897 36.3196, -75.95718 3…
#> 8 Gates 0.091 420 594 <int [9]> (((-76.56251 36.34057, -76.60424 …
#> 9 Warren 0.118 968 1190 <int [8]> (((-78.30876 36.26004, -78.28293 …
#> 10 Stokes 0.124 1612 2038 <int [8]> (((-80.02567 36.25023, -80.45301 …
#> # ℹ 90 more rows
# within_dist for Lee:
nc$within_dist[nc$NAME == "Lee"]
#> [[1]]
#> [1] 27 29 30 37 47 48 54 60 63 67 82 86
# resolved to NAMEs :
nc$NAME[nc$within_dist[nc$NAME == "Lee"][[1]]]
#> [1] "Alamance" "Orange" "Durham" "Wake" "Randolph"
#> [6] "Chatham" "Johnston" "Lee" "Harnett" "Moore"
#> [11] "Cumberland" "Hoke"
# ignore geometries for now for more compact output
nc_df <- st_drop_geometry(nc)
# build a nested tibble, each county row gets a tibble of neighbours (nb),
# use rowwise grouping to subset nc_df with within_dist of current row
nc_nested <- nc_df %>%
rowwise() %>%
mutate(
nb_idx = paste(within_dist, collapse = ","),
nb_names = paste(nc_df[["NAME"]][within_dist], collapse = ","),
nb = (nc_df[within_dist, ]) %>% select(-within_dist) %>% list()) %>%
ungroup()
nc_nested
#> # A tibble: 100 × 8
#> NAME AREA BIR74 BIR79 within_dist nb_idx nb_names nb
#> <chr> <dbl> <dbl> <dbl> <sgbp[,100]> <chr> <chr> <list>
#> 1 Ashe 0.114 1091 1364 <int [7]> 1,2,3,18,19,22,… Ashe,Al… <tibble>
#> 2 Alleghany 0.061 487 542 <int [7]> 1,2,3,18,19,23,… Ashe,Al… <tibble>
#> 3 Surry 0.143 3188 3616 <int [9]> 1,2,3,10,18,23,… Ashe,Al… <tibble>
#> 4 Currituck 0.07 508 830 <int [8]> 4,7,8,17,20,21,… Curritu… <tibble>
#> 5 Northampton 0.153 1421 1606 <int [9]> 5,6,8,9,16,28,3… Northam… <tibble>
#> 6 Hertford 0.097 1452 1838 <int [10]> 5,6,7,8,16,17,2… Northam… <tibble>
#> 7 Camden 0.062 286 350 <int [11]> 4,6,7,8,17,20,2… Curritu… <tibble>
#> 8 Gates 0.091 420 594 <int [9]> 4,5,6,7,8,17,20… Curritu… <tibble>
#> 9 Warren 0.118 968 1190 <int [8]> 5,9,13,15,16,24… Northam… <tibble>
#> 10 Stokes 0.124 1612 2038 <int [8]> 3,10,12,23,25,2… Surry,S… <tibble>
#> # ℹ 90 more rows
# we can now calculate summary statistics by accessing nested tibbles in
# nb column with purrr::map*() or lapply();
# or though rowwise grouping:
nc_nested %>%
select(NAME, nb_names, nb) %>%
rowwise() %>%
mutate(nb_bir74_mean = mean(nb$BIR74),
nb_bir74_sum = sum(nb$BIR74),
nb_area_sum = sum(nb$AREA))
#> # A tibble: 100 × 6
#> # Rowwise:
#> NAME nb_names nb nb_bir74_mean nb_bir74_sum nb_area_sum
#> <chr> <chr> <list> <dbl> <dbl> <dbl>
#> 1 Ashe Ashe,Alleghany,S… <tibble> 1946. 13625 0.784
#> 2 Alleghany Ashe,Alleghany,S… <tibble> 2092. 14643 0.839
#> 3 Surry Ashe,Alleghany,S… <tibble> 3111. 27997 1.06
#> 4 Currituck Currituck,Camden… <tibble> 607 4856 0.576
#> 5 Northampton Northampton,Hert… <tibble> 2047. 18420 1.22
#> 6 Hertford Northampton,Hert… <tibble> 1293. 12933 1.05
#> 7 Camden Currituck,Hertfo… <tibble> 784. 8622 0.953
#> 8 Gates Currituck,Northa… <tibble> 920. 8284 0.813
#> 9 Warren Northampton,Warr… <tibble> 2366. 18925 1.08
#> 10 Stokes Surry,Stokes,Roc… <tibble> 5660. 45276 0.998
#> # ℹ 90 more rows
# or we could unnest nb column to lenghten our dataset ...
nc_unnested <- nc_nested %>%
unnest(nb, names_sep = ".") %>%
select(-(within_dist:nb_names))
print(nc_unnested, n = 14)
#> # A tibble: 1,004 × 8
#> NAME AREA BIR74 BIR79 nb.NAME nb.AREA nb.BIR74 nb.BIR79
#> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 Ashe 0.114 1091 1364 Ashe 0.114 1091 1364
#> 2 Ashe 0.114 1091 1364 Alleghany 0.061 487 542
#> 3 Ashe 0.114 1091 1364 Surry 0.143 3188 3616
#> 4 Ashe 0.114 1091 1364 Wilkes 0.199 3146 3725
#> 5 Ashe 0.114 1091 1364 Watauga 0.081 1323 1775
#> 6 Ashe 0.114 1091 1364 Avery 0.064 781 977
#> 7 Ashe 0.114 1091 1364 Caldwell 0.122 3609 4249
#> 8 Alleghany 0.061 487 542 Ashe 0.114 1091 1364
#> 9 Alleghany 0.061 487 542 Alleghany 0.061 487 542
#> 10 Alleghany 0.061 487 542 Surry 0.143 3188 3616
#> 11 Alleghany 0.061 487 542 Wilkes 0.199 3146 3725
#> 12 Alleghany 0.061 487 542 Watauga 0.081 1323 1775
#> 13 Alleghany 0.061 487 542 Yadkin 0.086 1269 1568
#> 14 Alleghany 0.061 487 542 Iredell 0.155 4139 5400
#> # ℹ 990 more rows
# ... and use group_by() / summarise() or just summarise(..., .by):
nc_unnested %>%
summarise(nb_bir74_mean = mean(nb.BIR74),
nb_bir74_sum = sum(nb.BIR74),
nb_area_sum = sum(nb.AREA), .by = NAME)
#> # A tibble: 100 × 4
#> NAME nb_bir74_mean nb_bir74_sum nb_area_sum
#> <chr> <dbl> <dbl> <dbl>
#> 1 Ashe 1946. 13625 0.784
#> 2 Alleghany 2092. 14643 0.839
#> 3 Surry 3111. 27997 1.06
#> 4 Currituck 607 4856 0.576
#> 5 Northampton 2047. 18420 1.22
#> 6 Hertford 1293. 12933 1.05
#> 7 Camden 784. 8622 0.953
#> 8 Gates 920. 8284 0.813
#> 9 Warren 2366. 18925 1.08
#> 10 Stokes 5660. 45276 0.998
#> # ℹ 90 more rows
Created on 2023-08-01 with reprex v2.0.2