rr-sf

Intersecting shapefiles in R


I have these 2 shapefiles from the internet:

library(sf)
library(dplyr)
library(fs)

url_1 <- "https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lfsa000b21a_e.zip"

temp_dir <- tempdir()

zip_file <- file.path(temp_dir, "shapefile.zip")

download.file(url_1, destfile = zip_file)

exdir_ <- tempfile(pattern = "shp_1")

unzip(zip_file, exdir = exdir_, junkpaths = TRUE)

fs::dir_tree(exdir_)

file_1 <- st_read(exdir_)

fsa <- st_transform(file_1, "+proj=longlat +datum=WGS84")
unlink(temp_dir, recursive = TRUE)

url_2 <- "https://www12.statcan.gc.ca/census-recensement/alternative_alternatif.cfm?l=eng&dispext=zip&teng=lada000b21a_e.zip&k=%20%20%20151162&loc=//www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lada000b21a_e.zip"


download.file(url_2, destfile = "lada000b21a_e.zip")

unzip("lada000b21a_e.zip")

ada <- st_read("lada000b21a_e.shp")

For the same PRUID, I want to make a matrix which contains what % each polygon from shapefile 1 intersects with each polygon from shapefile 2.

library(sf)
library(dplyr)


calculate_intersection_percentages <- function(ada, fsa, pruid) {
  ada_filtered <- ada %>% filter(PRUID == pruid)
  fsa_filtered <- fsa %>% filter(PRUID == pruid)
  
  ada_filtered <- st_transform(ada_filtered, st_crs(fsa_filtered))
  
  results <- list()
  
  total_fsas <- nrow(fsa_filtered)
  completed_fsas <- 0
  
  for (ada_id in ada_filtered$ADAUID) {
    ada_single <- ada_filtered %>% filter(ADAUID == ada_id)
    ada_area <- st_area(ada_single)
    
    for (fsa_id in fsa_filtered$CFSAUID) {
      fsa_single <- fsa_filtered %>% filter(CFSAUID == fsa_id)
      
      intersection <- st_intersection(ada_single, fsa_single)
      
      if (length(intersection) > 0) {
        intersection_area <- st_area(intersection)
        percentage <- as.numeric(intersection_area / ada_area * 100)
        
        print(sprintf("ADA: %s, FSA: %s, Intersection = %.2f%%", ada_id, fsa_id, percentage))
        
        results[[ada_id]][[fsa_id]] <- percentage
      }
      
      completed_fsas <- completed_fsas + 1
      progress_percentage <- (completed_fsas / (total_fsas * nrow(ada_filtered))) * 100
      print(sprintf("Progress: %.2f%% of FSA calculations completed", progress_percentage))
    }
  }
  
  result_matrix <- do.call(rbind, lapply(results, function(x) {
    unlist(x, use.names = TRUE)
  }))
  
  result_matrix[is.na(result_matrix)] <- 0
  
  result_matrix <- result_matrix / rowSums(result_matrix) * 100
  
  return(result_matrix)
}

I launch the function like this:

result <- calculate_intersection_percentages(ada, fsa, pruid = "10")

The code runs, but I am getting some very exact numbers which makes me think the intersection is not happening correctly:

20.00000000 20.00000000 20.00000000 20.00000000 20.00000000

Also, there are 35 FSA's in Newfoundland (https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_A#:~:text=and%20some%20libraries.-,Newfoundland%20and%20Labrador,35%20FSAs%20in%20this%20list.), but I am only seeing 5:

> head(result)
              A0A      A1B      A1C      A1E      A1S
10010001 15.50990 26.73515 15.50990 26.73515 15.50990
10010002 23.85278 14.22083 23.85278 14.22083 23.85278
10010003 13.88354 29.17468 13.88354 29.17468 13.88354
10010004 20.84652 18.73022 20.84652 18.73022 20.84652
10010005 26.08074 10.87889 26.08074 10.87889 26.08074
10010006 20.00000 20.00000 20.00000 20.00000 20.00000

How can I correctly intersect these two shapefiles?


Solution

  • Just few interesting issues to start with, there might be more.

    lobstr::tree(results)
    <list>
    ├─10010001: <list>
    │ ├─A0A: 36.6844068668399
    │ ├─A0B: 63.3155931332246
    │ ├─A0C<dbl [0]>: 
    │ ├─A0E<dbl [0]>: 
    │ └─A0G<dbl [0]>: 
    ├─10010002: <list>
    │ ├─A0A: 62.6409032955523
    │ ├─A0B<dbl [0]>: 
    │ ├─A0C<dbl [0]>: 
    │ ├─A0E<dbl [0]>: 
    │ └─A0G<dbl [0]>: 
    ├─10010003: <list>
    │ ├─A0A: 32.2477921284301
    │ ├─A0B: 0
    │ ├─A0C<dbl [0]>: 
    │ ├─A0E<dbl [0]>: 
    │ └─A0G<dbl [0]>: 
    ├─10010004: <list>
    │ ├─A0A<dbl [0]>: 
    │ ├─A0B<dbl [0]>: 
    │ ├─A0C<dbl [0]>: 
    │ ├─A0E<dbl [0]>: 
    │ └─A0G<dbl [0]>: 
    └─10010005: <list>
      ├─A0A: 0
      ├─A0B<dbl [0]>: 
      ├─A0C<dbl [0]>: 
      ├─A0E<dbl [0]>: 
      └─A0G<dbl [0]>: 
    
    lapply(results, unlist, use.names = TRUE) |> lobstr::tree(show_attributes = TRUE)
    <list>
    ├─10010001<dbl [2]>: 36.6844068668399, 63.3155931332246
    │ └┄attr(,"names")<chr [2]>: "A0A", "A0B"
    ├─10010002: 62.6409032955523
    │ └┄attr(,"names"): "A0A"
    ├─10010003<dbl [2]>: 32.2477921284301, 0
    │ └┄attr(,"names")<chr [2]>: "A0A", "A0B"
    ├─10010004<dbl [0]>: 
    ├─10010005: 0
    ┊ └┄attr(,"names"): "A0A"
    └┄attr(,"names")<chr [5]>: "10010001", "10010002", "10010003", "10010004", "10010005"
    
    lapply(results, unlist, use.names = TRUE) |> do.call(what = rbind)
                  A0A      A0B
    10010001 36.68441 63.31559
    10010002 62.64090 62.64090
    10010003 32.24779  0.00000
    10010005  0.00000  0.00000
    

    To continue with a possible solution, it seems that a frame from st_intersection() pivoted wider should be quite close to what you are after.

    Fetch data and load just a small subsets for testing:

    library(sf)
    #> Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
    library(dplyr)
    library(tidyr)
    library(ggplot2)
    
    # curl::curl_download(
    #   "https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lfsa000b21a_e.zip", 
    #   "lfsa000b21a_e.zip",
    #   quiet = FALSE)
    # 
    # curl::curl_download(
    #   "https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lada000b21a_e.zip", 
    #   "lada000b21a_e.zip",
    #   quiet = FALSE)
    
    # check layers
    rbind(
      st_layers("/vsizip/lfsa000b21a_e.zip/lfsa000b21a_e/"),
      st_layers("/vsizip/lada000b21a_e.zip")
    )
    #> Driver: ESRI Shapefile 
    #> Driver: ESRI Shapefile 
    #> Available layers:
    #>      layer_name geometry_type features fields                          crs_name
    #> 1 lfsa000b21a_e       Polygon     1643      5 NAD83 / Statistics Canada Lambert
    #> 2 lada000b21a_e       Polygon     5433      4 NAD83 / Statistics Canada Lambert
    
    # read relevant subsets for testing
    fsa <- read_sf(
      "/vsizip/lfsa000b21a_e.zip/lfsa000b21a_e/", 
      query = "SELECT CFSAUID, PRUID FROM lfsa000b21a_e WHERE PRUID = '10'",
      quiet = TRUE)
    nrow(fsa)
    #> [1] 35
    
    ada <- read_sf(
      "/vsizip/lada000b21a_e.zip", 
      query = "SELECT ADAUID, PRUID FROM lada000b21a_e WHERE PRUID = '10'",
      quiet = TRUE)
    nrow(ada)
    #> [1] 86
    

    Area ratios of intersections, feel free to adjust scaling:

    isect_area_ratio <- function(ada, fsa, pruid) {
      fsa_filtered <- 
        fsa |> 
        filter(PRUID == pruid) |> 
        select(CFSAUID)
    
      # add `area_ada` so it would be included in st_intersection() result
      ada_filtered <- 
        ada |> 
        filter(PRUID == pruid) |> 
        select(ADAUID) |>   
        st_transform(st_crs(fsa_filtered)) |> 
        mutate(area_ada = st_area(geometry)) 
    
      # intersection also includes geometries without area (points, linestrings, ..), 
      # leave those out;
      # once we have ratios, we can drop geometries, pivot from long to wide and
      # convert to matrix
      area_ratio_m <- 
        st_intersection(ada_filtered, fsa_filtered) |> 
        filter(st_geometry_type(geometry) %in% c("POLYGON", "MULTIPOLYGON", "GEOMETRYCOLLECTION")) |> 
        mutate(area_ratio = (st_area(geometry) / area_ada) |> as.numeric()) |> 
        st_drop_geometry() |> 
        select(ADAUID, CFSAUID, area_ratio) |> 
        pivot_wider(names_from = CFSAUID, values_from = area_ratio, values_fill = 0) |> 
        tibble::column_to_rownames("ADAUID") |> 
        as.matrix()
      
      # make sure matrix row and column order aligns with row order of input dataframes
      area_ratio_m[ada_filtered$ADAUID, fsa_filtered$CFSAUID ]
    }
    

    Test:

    tictoc::tic("isect_area_ratio")
    m <- isect_area_ratio(ada, fsa, pruid = "10")
    tictoc::toc()
    #> isect_area_ratio: 9.14 sec elapsed
    
    # expect 86x35 matrix
    str(m)
    #>  num [1:86, 1:35] 0.367 0.626 0.322 0 0 ...
    #>  - attr(*, "dimnames")=List of 2
    #>   ..$ : chr [1:86] "10010001" "10010002" "10010003" "10010004" ...
    #>   ..$ : chr [1:35] "A0A" "A0B" "A0C" "A0E" ...
    
    # upper left corner
    m[1:10, 1:10]
    #>                A0A       A0B A0C A0E A0G A0H A0J A0K A0L A0M
    #> 10010001 0.3668441 0.6331559   0   0   0   0   0   0   0   0
    #> 10010002 0.6264090 0.0000000   0   0   0   0   0   0   0   0
    #> 10010003 0.3224779 0.0000000   0   0   0   0   0   0   0   0
    #> 10010004 0.0000000 0.0000000   0   0   0   0   0   0   0   0
    #> 10010005 0.0000000 0.0000000   0   0   0   0   0   0   0   0
    #> 10010006 0.0000000 0.0000000   0   0   0   0   0   0   0   0
    #> 10010007 0.0000000 0.0000000   0   0   0   0   0   0   0   0
    #> 10010008 0.0000000 0.0000000   0   0   0   0   0   0   0   0
    #> 10010009 0.0000000 0.0000000   0   0   0   0   0   0   0   0
    #> 10010010 0.0000000 0.0000000   0   0   0   0   0   0   0   0
    
    # check if rows add up to ~1
    rowSums(m) |> summary() 
    #>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    #>       1       1       1       1       1       1
    

    A bit closer look at a small subset of intersections, note how we end up with geometry collections and multipolygons, one for every intersecting geometry pair. Number of individual intersection polygons can be much higher.

    ada |> 
      filter(ADAUID == "10010032") |> 
      mutate(area_ada = st_area(geometry)) |> 
      st_intersection(fsa) |> 
      filter(st_geometry_type(geometry) %in% c("POLYGON", "MULTIPOLYGON", "GEOMETRYCOLLECTION")) |> 
      mutate(area_ratio = (st_area(geometry) / area_ada) |> as.numeric()) |> 
      print() |> 
      ggplot() +
      geom_sf(data = ada[ada$ADAUID == "10010032","geometry"]) +
      geom_sf(aes(fill = CFSAUID), legend = FALSE) +
      geom_sf_label(aes(label = scales::label_percent()(area_ratio))) +
      facet_grid(rows = vars(ADAUID), cols = vars(CFSAUID)) +
      guides(fill = "none")
    #> Warning: attribute variables are assumed to be spatially constant throughout
    #> all geometries
    #> Simple feature collection with 2 features and 6 fields
    #> Geometry type: GEOMETRY
    #> Dimension:     XY
    #> Bounding box:  xmin: 8952734 ymin: 2027053 xmax: 9015751 ymax: 2119991
    #> Projected CRS: NAD83 / Statistics Canada Lambert
    #> # A tibble: 2 × 7
    #>   ADAUID   PRUID   area_ada CFSAUID PRUID.1                  geometry area_ratio
    #> * <chr>    <chr>      [m^2] <chr>   <chr>              <GEOMETRY [m]>      <dbl>
    #> 1 10010032 10        3.49e9 A0A     10      MULTIPOLYGON (((8991045 …      0.890
    #> 2 10010032 10        3.49e9 A0B     10      GEOMETRYCOLLECTION (POLY…      0.110
    

    Created on 2024-09-23 with reprex v2.1.1