rspatialr-sf

Assign observation coordinates into multiple polygons


I have a dataframe with observations made at lat/lon coordinates. I have three polygons that span broad areas within which the observations were made. I want to create a grouping variable in a new column, that contains a short letter code for the polygon within which the observation was made. Here is my sample data:

> dput(testdata)
structure(list(lon = c(-79.30315, -79.29561, -79.29572, -79.29833, 
-79.29603, -79.29659, -79.29097, -79.29347, -79.29347, -79.294, 
-79.29644, -79.29653, -79.27027, -79.27656, -79.26938, -79.28683, 
-79.29256, -79.28848, -79.29256, -79.29097, -79.292277, -79.29311, 
-79.29259, -79.292277, -79.29654, -79.29123, -79.3037, -79.28743, 
-79.29641, -79.29661, -79.29295, -79.29101, -79.2951, -79.29652, 
-79.29035, -79.29183, -79.29661, -79.30317, -79.28734, -79.28734, 
-79.296509, -79.24673, -79.24699, -79.24689, -79.24693, -79.26633, 
-79.2585, -79.25765, -79.27642, -79.2754, -79.28565, -79.29443, 
-79.28264, -79.24703, -79.24706, -79.24706, -79.247, -79.24863, 
-79.30019, -79.3023, -79.30154, -79.24672, -79.24672, -79.24672, 
-79.2969, -79.25789, -79.26389, -79.26274, -79.26976, -79.29689, 
-79.26999, -79.29676, -79.29634, -79.29671, -79.29676, -79.29739, 
-79.30154, -79.28073, -79.28101, -79.28164, -79.28447, -79.28244, 
-79.28674, -79.28123, -79.28515, -79.29752, -79.29449, -79.30283, 
-79.26671, -79.29492, -79.24765, -79.23946, -79.24728, -79.24826, 
-79.2477), lat = c(25.69511, 25.69913, 25.69909, 25.69808, 25.69791, 
25.6987, 25.6976, 25.69887, 25.69887, 25.69812, 25.698905, 25.69876, 
25.69545, 25.69664, 25.69499, 25.69707, 25.69877, 25.69738, 25.69877, 
25.69786, 25.698363, 25.69876, 25.69884, 25.698363, 25.699164, 
25.69808, 25.69149, 25.69682, 25.69919, 25.69897, 25.6987, 25.69785, 
25.69937, 25.69917, 25.69759, 25.69846, 25.69897, 25.69231, 25.69592, 
25.69592, 25.699178, 25.73256, 25.73151, 25.73248, 25.7323, 25.73618, 
25.75886, 25.76018, 25.69645, 25.69439, 25.69597, 25.69989, 25.69769, 
25.73171, 25.73161, 25.73156, 25.23161, 25.73258, 25.69319, 25.6926, 
25.69706, 25.73245, 25.73245, 25.73245, 25.70004, 25.77368, 25.7471, 
25.72693, 25.72724, 25.6997, 25.72725, 25.69891, 25.69924, 25.69885, 
25.69891, 25.69776, 25.69712, 25.72679, 25.72655, 25.73001, 25.72484, 
25.72546, 25.72225, 25.72625, 25.72549, 25.69781, 25.69989, 25.69358, 
25.69701, 25.69955, 25.73175, 25.73411, 25.73237, 25.73462, 25.73189
)), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 
12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 40L, 41L, 42L, 43L, 
44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L, 
57L, 58L, 59L, 60L, 65L, 66L, 67L, 68L, 69L, 70L, 71L, 72L, 73L, 
74L, 75L, 76L, 77L, 78L, 79L, 80L, 81L, 82L, 83L, 84L, 85L, 86L, 
87L, 88L, 89L, 90L, 91L, 92L, 93L, 94L, 95L, 96L, 97L, 98L, 99L, 
100L, 135L, 136L, 137L, 138L, 139L, 140L, 141L, 142L, 143L, 144L, 
145L, 146L, 147L, 148L, 149L, 150L, 151L, 152L), class = "data.frame")

These are my three polygons:

#POL1
coords1 <- matrix(c(-79.0877, 25.7213, 
                     -79.32764, 25.69931, 
                     -79.32665, 25.20054, 
                     -79.04101, 25.21296),
                   ncol = 2, byrow = T)

coords1 <- rbind(coords1, coords1[1, ]) # Ensure the polygon is closed by repeating the first coordinate at the end

### Create polygon
pol_1 <- sf::st_sfc(sf::st_polygon(list(coords1)), crs = 4326)

# POL2
## Define pol boundaries
coords2 <- matrix(c(-79.25322, 25.76284,
                      -79.21888, 25.76624, 
                      -79.21459, 25.72063,
                      -79.25613, 25.72349),
                    ncol = 2, byrow = T)

coords2 <- rbind(coords2, coords2[1, ]) # Ensure the polygon is closed by repeating the first coordinate at the end

## Create polygon
pol_2 <- sf::st_sfc(sf::st_polygon(list(coords2)), crs = 4326)


# POL3
## Define pol boundaries
coords3 <- matrix(c(-79.25446, 25.7759, 
                      -79.26378, 25.70252, 
                      -79.31184,25.71772, 
                      -79.26071, 25.77646), 
                    ncol = 2, byrow = T)

coords3 <- rbind(coords3, coords3[1, ]) # Ensure the polygon is closed by repeating the first coordinate at the end

## Create polygon
pol_3 <- sf::st_sfc(sf::st_polygon(list(coords3)), crs = 4326)

I used the following code to individually create a new column with TRUE/FALSE if the data point is within one of the three polygons. I then used a for loop to fill in another column called loc that contains the nr. of the polygon.

### Check within which polygon each data point lies
test_sf <- sf::st_as_sf(testdata, coords = c("lon", "lat"), crs = 4326)
test_sf $in_one <- sf::st_within(test_sf , pol_1, sparse = FALSE)
test_sf $in_two <- sf::st_within(test_sf , pol_2, sparse = FALSE)
test_sf $in_three <- sf::st_within(test_sf , pol_3, sparse = FALSE)

### convert back to dataframe
test_loc<- st_drop_geometry(test_sf)

### initiate new column
test_loc$loc <- NA

### Loop over each row
for(i in 1:nrow(test_loc)) {
  # Loop over the columns (check each column for TRUE)
  if(test_loc[i, 3] == TRUE & test_loc[i,4] == FALSE & test_loc[i,5] == FALSE) {
      test_loc[i, 6] <- "One"} 
  else if(test_loc[i, 3] == FALSE & test_loc[i,4] == TRUE & test_loc[i,5] == FALSE) {
    test_loc[i, 6] <- "Two"} 
  else if(test_loc[i, 3] == FALSE & test_loc[i,4] == FALSE & test_loc[i,5] == TRUE) {
    test_loc[i, 6] <- "Three"} 
  else {NA}
}

However, this is a very long code, for what I think could be done more efficiently, if all three polygons are assessed in one step and the loc column is filled out across all three polygons together and the name of the polygon is used to populate the new columns. however, I am struggling with getting this done. Any help is much appreciated.


Solution

  • Described operation is usually referred to as a spatial join, and for this you usually want to deal with sf objects.

    I'd start by collecting those individual polygons into a single sf object (note that you have a small overlap there), add some identifying attribute and st_join this to the sf object of points.

    library(sf)
    #> Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
    library(mapview)
    
    # collect polygon sfcs and create sf object
    (pol_objs <- ls(pattern = "^pol_\\d$") )
    #> [1] "pol_1" "pol_2" "pol_3"
    
    pol_sf <- 
      pol_objs |> 
      mget() |> 
      lapply(\(x) st_sf(geometry = x)) |> 
      do.call(what = rbind) |> 
      within(pol <-  pol_objs)
    pol_sf
    #> Simple feature collection with 3 features and 1 field
    #> Geometry type: POLYGON
    #> Dimension:     XY
    #> Bounding box:  xmin: -79.32764 ymin: 25.20054 xmax: -79.04101 ymax: 25.77646
    #> Geodetic CRS:  WGS 84
    #>                         geometry   pol
    #> 1 POLYGON ((-79.0877 25.7213,... pol_1
    #> 2 POLYGON ((-79.25322 25.7628... pol_2
    #> 3 POLYGON ((-79.25446 25.7759... pol_3
    
    test_sf <- 
      st_as_sf(testdata, coords = c("lon", "lat"), crs = 4326) |> 
      # spatial join, add atributes from intersecting features of pol_sf 
      st_join(pol_sf) |> 
      # factors for mapview legend order
      within(loc <- factor(pol, levels = paste0("pol_", 1:3), labels = c("One", "Two", "Three"))) |> 
      subset(select = -pol)
    test_sf
    #> Simple feature collection with 95 features and 1 field
    #> Geometry type: POINT
    #> Dimension:     XY
    #> Bounding box:  xmin: -79.3037 ymin: 25.23161 xmax: -79.23946 ymax: 25.77368
    #> Geodetic CRS:  WGS 84
    #> First 10 features:
    #>                      geometry loc
    #> 1  POINT (-79.30315 25.69511) One
    #> 2  POINT (-79.29561 25.69913) One
    #> 3  POINT (-79.29572 25.69909) One
    #> 4  POINT (-79.29833 25.69808) One
    #> 5  POINT (-79.29603 25.69791) One
    #> 6   POINT (-79.29659 25.6987) One
    #> 7   POINT (-79.29097 25.6976) One
    #> 8  POINT (-79.29347 25.69887) One
    #> 9  POINT (-79.29347 25.69887) One
    #> 10   POINT (-79.294 25.69812) One
    
    mapview(pol_sf, alpha.regions = .1) + mapview(test_sf)
    

    mapview screenshot

    Convert to regular frame, if needed:

    data.frame(
      st_drop_geometry(test_sf), 
      st_coordinates(test_sf) |> `colnames<-`(c("lon", "lat"))
    ) |> head()
    #>   loc       lon      lat
    #> 1 One -79.30315 25.69511
    #> 2 One -79.29561 25.69913
    #> 3 One -79.29572 25.69909
    #> 4 One -79.29833 25.69808
    #> 5 One -79.29603 25.69791
    #> 6 One -79.29659 25.69870