I have a dataframe with observations made at lat/lon coordinates. I have three polygons that span broad areas within which the observations were made. I want to create a grouping variable in a new column, that contains a short letter code for the polygon within which the observation was made. Here is my sample data:
> dput(testdata)
structure(list(lon = c(-79.30315, -79.29561, -79.29572, -79.29833,
-79.29603, -79.29659, -79.29097, -79.29347, -79.29347, -79.294,
-79.29644, -79.29653, -79.27027, -79.27656, -79.26938, -79.28683,
-79.29256, -79.28848, -79.29256, -79.29097, -79.292277, -79.29311,
-79.29259, -79.292277, -79.29654, -79.29123, -79.3037, -79.28743,
-79.29641, -79.29661, -79.29295, -79.29101, -79.2951, -79.29652,
-79.29035, -79.29183, -79.29661, -79.30317, -79.28734, -79.28734,
-79.296509, -79.24673, -79.24699, -79.24689, -79.24693, -79.26633,
-79.2585, -79.25765, -79.27642, -79.2754, -79.28565, -79.29443,
-79.28264, -79.24703, -79.24706, -79.24706, -79.247, -79.24863,
-79.30019, -79.3023, -79.30154, -79.24672, -79.24672, -79.24672,
-79.2969, -79.25789, -79.26389, -79.26274, -79.26976, -79.29689,
-79.26999, -79.29676, -79.29634, -79.29671, -79.29676, -79.29739,
-79.30154, -79.28073, -79.28101, -79.28164, -79.28447, -79.28244,
-79.28674, -79.28123, -79.28515, -79.29752, -79.29449, -79.30283,
-79.26671, -79.29492, -79.24765, -79.23946, -79.24728, -79.24826,
-79.2477), lat = c(25.69511, 25.69913, 25.69909, 25.69808, 25.69791,
25.6987, 25.6976, 25.69887, 25.69887, 25.69812, 25.698905, 25.69876,
25.69545, 25.69664, 25.69499, 25.69707, 25.69877, 25.69738, 25.69877,
25.69786, 25.698363, 25.69876, 25.69884, 25.698363, 25.699164,
25.69808, 25.69149, 25.69682, 25.69919, 25.69897, 25.6987, 25.69785,
25.69937, 25.69917, 25.69759, 25.69846, 25.69897, 25.69231, 25.69592,
25.69592, 25.699178, 25.73256, 25.73151, 25.73248, 25.7323, 25.73618,
25.75886, 25.76018, 25.69645, 25.69439, 25.69597, 25.69989, 25.69769,
25.73171, 25.73161, 25.73156, 25.23161, 25.73258, 25.69319, 25.6926,
25.69706, 25.73245, 25.73245, 25.73245, 25.70004, 25.77368, 25.7471,
25.72693, 25.72724, 25.6997, 25.72725, 25.69891, 25.69924, 25.69885,
25.69891, 25.69776, 25.69712, 25.72679, 25.72655, 25.73001, 25.72484,
25.72546, 25.72225, 25.72625, 25.72549, 25.69781, 25.69989, 25.69358,
25.69701, 25.69955, 25.73175, 25.73411, 25.73237, 25.73462, 25.73189
)), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 40L, 41L, 42L, 43L,
44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L,
57L, 58L, 59L, 60L, 65L, 66L, 67L, 68L, 69L, 70L, 71L, 72L, 73L,
74L, 75L, 76L, 77L, 78L, 79L, 80L, 81L, 82L, 83L, 84L, 85L, 86L,
87L, 88L, 89L, 90L, 91L, 92L, 93L, 94L, 95L, 96L, 97L, 98L, 99L,
100L, 135L, 136L, 137L, 138L, 139L, 140L, 141L, 142L, 143L, 144L,
145L, 146L, 147L, 148L, 149L, 150L, 151L, 152L), class = "data.frame")
These are my three polygons:
#POL1
coords1 <- matrix(c(-79.0877, 25.7213,
-79.32764, 25.69931,
-79.32665, 25.20054,
-79.04101, 25.21296),
ncol = 2, byrow = T)
coords1 <- rbind(coords1, coords1[1, ]) # Ensure the polygon is closed by repeating the first coordinate at the end
### Create polygon
pol_1 <- sf::st_sfc(sf::st_polygon(list(coords1)), crs = 4326)
# POL2
## Define pol boundaries
coords2 <- matrix(c(-79.25322, 25.76284,
-79.21888, 25.76624,
-79.21459, 25.72063,
-79.25613, 25.72349),
ncol = 2, byrow = T)
coords2 <- rbind(coords2, coords2[1, ]) # Ensure the polygon is closed by repeating the first coordinate at the end
## Create polygon
pol_2 <- sf::st_sfc(sf::st_polygon(list(coords2)), crs = 4326)
# POL3
## Define pol boundaries
coords3 <- matrix(c(-79.25446, 25.7759,
-79.26378, 25.70252,
-79.31184,25.71772,
-79.26071, 25.77646),
ncol = 2, byrow = T)
coords3 <- rbind(coords3, coords3[1, ]) # Ensure the polygon is closed by repeating the first coordinate at the end
## Create polygon
pol_3 <- sf::st_sfc(sf::st_polygon(list(coords3)), crs = 4326)
I used the following code to individually create a new column with TRUE/FALSE if the data point is within one of the three polygons. I then used a for loop to fill in another column called loc
that contains the nr. of the polygon.
### Check within which polygon each data point lies
test_sf <- sf::st_as_sf(testdata, coords = c("lon", "lat"), crs = 4326)
test_sf $in_one <- sf::st_within(test_sf , pol_1, sparse = FALSE)
test_sf $in_two <- sf::st_within(test_sf , pol_2, sparse = FALSE)
test_sf $in_three <- sf::st_within(test_sf , pol_3, sparse = FALSE)
### convert back to dataframe
test_loc<- st_drop_geometry(test_sf)
### initiate new column
test_loc$loc <- NA
### Loop over each row
for(i in 1:nrow(test_loc)) {
# Loop over the columns (check each column for TRUE)
if(test_loc[i, 3] == TRUE & test_loc[i,4] == FALSE & test_loc[i,5] == FALSE) {
test_loc[i, 6] <- "One"}
else if(test_loc[i, 3] == FALSE & test_loc[i,4] == TRUE & test_loc[i,5] == FALSE) {
test_loc[i, 6] <- "Two"}
else if(test_loc[i, 3] == FALSE & test_loc[i,4] == FALSE & test_loc[i,5] == TRUE) {
test_loc[i, 6] <- "Three"}
else {NA}
}
However, this is a very long code, for what I think could be done more efficiently, if all three polygons are assessed in one step and the loc
column is filled out across all three polygons together and the name of the polygon is used to populate the new columns. however, I am struggling with getting this done. Any help is much appreciated.
Described operation is usually referred to as a spatial join, and for this you usually want to deal with sf
objects.
I'd start by collecting those individual polygons into a single sf
object (note that you have a small overlap there), add some identifying attribute and st_join
this to the sf
object of points.
library(sf)
#> Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
library(mapview)
# collect polygon sfcs and create sf object
(pol_objs <- ls(pattern = "^pol_\\d$") )
#> [1] "pol_1" "pol_2" "pol_3"
pol_sf <-
pol_objs |>
mget() |>
lapply(\(x) st_sf(geometry = x)) |>
do.call(what = rbind) |>
within(pol <- pol_objs)
pol_sf
#> Simple feature collection with 3 features and 1 field
#> Geometry type: POLYGON
#> Dimension: XY
#> Bounding box: xmin: -79.32764 ymin: 25.20054 xmax: -79.04101 ymax: 25.77646
#> Geodetic CRS: WGS 84
#> geometry pol
#> 1 POLYGON ((-79.0877 25.7213,... pol_1
#> 2 POLYGON ((-79.25322 25.7628... pol_2
#> 3 POLYGON ((-79.25446 25.7759... pol_3
test_sf <-
st_as_sf(testdata, coords = c("lon", "lat"), crs = 4326) |>
# spatial join, add atributes from intersecting features of pol_sf
st_join(pol_sf) |>
# factors for mapview legend order
within(loc <- factor(pol, levels = paste0("pol_", 1:3), labels = c("One", "Two", "Three"))) |>
subset(select = -pol)
test_sf
#> Simple feature collection with 95 features and 1 field
#> Geometry type: POINT
#> Dimension: XY
#> Bounding box: xmin: -79.3037 ymin: 25.23161 xmax: -79.23946 ymax: 25.77368
#> Geodetic CRS: WGS 84
#> First 10 features:
#> geometry loc
#> 1 POINT (-79.30315 25.69511) One
#> 2 POINT (-79.29561 25.69913) One
#> 3 POINT (-79.29572 25.69909) One
#> 4 POINT (-79.29833 25.69808) One
#> 5 POINT (-79.29603 25.69791) One
#> 6 POINT (-79.29659 25.6987) One
#> 7 POINT (-79.29097 25.6976) One
#> 8 POINT (-79.29347 25.69887) One
#> 9 POINT (-79.29347 25.69887) One
#> 10 POINT (-79.294 25.69812) One
mapview(pol_sf, alpha.regions = .1) + mapview(test_sf)
Convert to regular frame, if needed:
data.frame(
st_drop_geometry(test_sf),
st_coordinates(test_sf) |> `colnames<-`(c("lon", "lat"))
) |> head()
#> loc lon lat
#> 1 One -79.30315 25.69511
#> 2 One -79.29561 25.69913
#> 3 One -79.29572 25.69909
#> 4 One -79.29833 25.69808
#> 5 One -79.29603 25.69791
#> 6 One -79.29659 25.69870