I have many .gpx track files where people recorded a single track but covered distance in multiple treatment areas (within different polygons). I have a shapefile for each polygon and the individual .gpx files. I have updated the code below and am getting closer, but I still don't know how to estimate the time spent in a polygon when there is just one point in it (essentially the transition time).
Question 1: I ultimately need to calculate the time each track spent in each polygon and the total distance each person walked within each polygon.
I can split the tracks like this:
library(sf) # For spatial data
library(lubridate) # For time manipulation
library(dplyr) # For data wrangling
library(geosphere) # For distance calculations
library(sp) # For spatial operations
library(tidyverse) # Optional, for convenience
library(trackeR) # Optional, for GPX parsing
#load tracks
gpx_points <- sf::read_sf("try1/Track_2023-10-20 FRINGE_cjh.gpx", layer = "track_points")
#load polygons
polygons <- st_read("try1/polygons_combo.shp")
gpx_points <- st_transform(gpx_points, st_crs(polygons))
#create linestring
gpx_line <- gpx_points %>%
arrange(time) %>%
summarise(do_union = FALSE) %>%
st_cast("LINESTRING")
#split the linestring by polygons
segments <- st_intersection(gpx_line, polygons)
gpx_in_poly <- st_join(gpx_points, polygons, join = st_within)
gpx_in_poly1 <- gpx_in_poly %>%
arrange(time) %>%
group_by(SiteName) %>%
mutate(
next_time = lead(time),
next_geom = lead(geometry),
time_diff = as.numeric(difftime(next_time, time, units = "secs")),
dist_m = geosphere::distHaversine(
st_coordinates(geometry),
st_coordinates(next_geom)
)
) %>%
# Filter out segments with time gaps > 60 seconds
filter(time_diff <= 60) %>%
summarise(
total_time_sec = sum(time_diff, na.rm = TRUE),
total_distance_m = sum(dist_m, na.rm = TRUE)
)
gpx_in_poly1
# Quick visualization
library(ggplot2)
ggplot() +
geom_sf(data = polygons, fill = NA, color = "grey40") +
geom_sf(data = gpx_in_poly, aes(color = factor(SiteName)), size = 0.8) +
scale_color_viridis_d(na.value = "black") +
theme_minimal()
I now have the time spent and distance, except I'm pretty sure this excludes that one point in yellow, since there would not be another point within 60 s in the same SiteName. Any ideas on how I can improve this code so I don't lose that information, or is this the best I can do? It's not too terrible to lose that accuracy, but if I can avoid it, that would be great.
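To illustrate with a toy, non-spatial example of what I think is happening (made-up data, just the grouped lead() step): a lone point in a SiteName has no following point in its group, so its time_diff is NA and filter(time_diff <= 60) drops it entirely.
library(dplyr)
toy <- tibble(
SiteName = c("A", "A", "B", "A"),
time = as.POSIXct("2023-10-20 12:00:00", tz = "UTC") + c(0, 30, 60, 90)
)
toy |>
arrange(time) |>
group_by(SiteName) |>
mutate(time_diff = as.numeric(difftime(lead(time), time, units = "secs")))
# the single "B" row gets time_diff = NA, so it contributes neither time nor distance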
Question 2: Additionally, I have ~100 .gpx files, so it would be great to figure out a way to loop through all of them and end up with a dataframe with the original filename, the total time, and the total distance, so I can evaluate effort for my capture rates.
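Something like this is the shape I'm imagining, where per_track() is a hypothetical wrapper around the steps above that returns one row per polygon with the totals:
library(purrr)
gpx_files <- list.files("try1", pattern = "\\.gpx$", full.names = TRUE)
effort <- gpx_files |>
set_names(basename(gpx_files)) |>
map(per_track) |>  # per_track() doesn't exist yet, this is the missing piece
list_rbind(names_to = "filename")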
I have one .gpx file and the polygon shapefile here for access.
This is basically a trajectory analysis, and it might be worth going through a few related CRAN Task Views to check what's already available (e.g. Tracking, SpatioTemporal).
Here's one example using move2, with basic speed-based clustering / filtering to reduce low-speed distance noise a bit, and with selective interpolation (the main reason to opt for move2 here) to include at least some duration from segments that cross polygon borders. But interpolation is just another estimation: while it reduces the gap between the total duration reported in the GPX and the sum of calculated durations, it may or may not provide meaningful results for your study.
Regarding the data, I hope you are aware that those example polygons overlap, meaning that you can get multiple polygon matches for a trackpoint with st_join().
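A quick way to check, reusing the objects from your question: tag each trackpoint with an id and count how many rows it turns into after the join, as points matching more than one polygon come back duplicated.
# sketch, assuming sf & dplyr are attached and gpx_points / polygons exist as in the question
gpx_points |>
mutate(pt_id = row_number()) |>
st_join(polygons, join = st_within) |>
st_drop_geometry() |>
count(pt_id, name = "n_matches") |>
filter(n_matches > 1)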
tmp_path <- fs::path_temp("tmp")
options(HTTPUserAgent = "libcurl") # as '"R (4.5.1 x86_64-w64-mingw32 x86_64 mingw32)"' is blocked ..
archive::archive_extract(
"https://figshare.com/ndownloader/articles/29998564?private_link=0e45ad939187bc62d870",
tmp_path)
shp_vsi <- paste0("/vsizip/", fs::path(tmp_path, "polygons_combo.zip"))
gpxs <- fs::dir_ls(tmp_path, glob = "*.gpx")
fs::dir_tree(tmp_path)
#> D:/rtmp/RtmpSmA4Zk/tmp
#> ├── polygons_combo.shp
#> ├── polygons_combo.zip
#> └── Track_2023-10-20 FRINGE_cjh.gpx
The provided GPX comes with some extra summary data in a TrackStatsExtension:
xml2::read_xml(gpxs[1]) |>
xml2::xml_ns_strip() |>
xml2::xml_find_first("//gpxtrkx:TrackStatsExtension") |>
xml2::as_list() |> stack() |> _[1:4, 2:1]
#> ind values
#> 1 Distance 2923
#> 2 TotalElapsedTime 12191
#> 3 MovingTime 3936
#> 4 StoppedTime 7735
First, create a move2 trajectory object from the trackpoints (basically an sf object with a few extras), then split by polygons with st_intersection().
library(sf)
#> Linking to GEOS 3.13.1, GDAL 3.11.0, PROJ 9.6.0; sf_use_s2() is TRUE
library(move2)
library(dplyr, warn.conflicts = FALSE)
library(tidyr)
library(purrr)
library(lubridate, warn.conflicts = FALSE)
# to extract extension data from GPX
library(xml2)
# for visualization
library(mapview)
library(leafsync)
poly_sf <- read_sf(shp_vsi, query = "SELECT SiteName from polygons_combo")
# track for visualisations and for reference
trk_sf <- read_sf(gpxs[1], layer = "tracks")
# CRS of (valid) GPX is always WGS84, we can skip & ignore transformations for now
stopifnot(st_crs(trk_sf) == st_crs(poly_sf))
# create move2 trajectory object from GPX trackpoints
traj_mt <-
read_sf(gpxs[1], query = glue::glue("SELECT '{basename(gpxs[1])}' as gpx, time FROM track_points")) |>
mt_as_move2(time_column = "time", track_id_column = "gpx")
print(traj_mt, n = 3)
#> A <move2> with `track_id_column` "gpx" and `time_column` "time"
#> Containing 1 track lasting 3.38 hours in a
#> Simple feature collection with 532 features and 2 fields
#> Geometry type: POINT
#> Dimension: XY
#> Bounding box: xmin: -119.2274 ymin: 34.24512 xmax: -119.2243 ymax: 34.24858
#> Geodetic CRS: WGS 84
#> # A tibble: 532 × 3
#> gpx time geometry
#> * <chr> <dttm> <POINT [°]>
#> 1 Track_2023-10-20 FRINGE_cjh.gpx 2023-10-20 20:44:04 (-119.2272 34.24804)
#> 2 Track_2023-10-20 FRINGE_cjh.gpx 2023-10-20 20:44:05 (-119.2272 34.24804)
#> 3 Track_2023-10-20 FRINGE_cjh.gpx 2023-10-20 20:44:35 (-119.2272 34.24804)
#> # ℹ 529 more rows
#> Track features:
#> gpx
#> 1 Track_2023-10-20 FRINGE_cjh.gpx
# speed-based filter to reduce low-speed clusters and distance noise a bit:
# flag points where speed (to next pt) is over threshold ->
# find threshold runs ...
traj_spd_mt <-
# magrittr pipe for leaner nested calls
traj_mt %>%
mutate(
spd = mt_speed(., units = NULL),
spd_thres = spd > .4, # m/s
spd_thres_run = consecutive_id(spd_thres) |> factor() # factor for viz
)
# ... for runs below threshold, keep only 1st and last point,
# i.e. 1st and last cluster point are always kept while
# mid-sections for n > 2 clusters are dropped
traj_spd_flt_mt <-
traj_spd_mt |>
filter(spd_thres | row_number() %in% c(1, n()), .by = spd_thres_run)
# compare pre- & post-filter:
m_base <-
mapview(poly_sf, zcol = "SiteName", alpha.regions = .2, legend = FALSE) +
mapview(trk_sf, alpha = .5, legend = FALSE)
at <- c(0, .2, .4, .8, Inf)
m_spd <- m_base + mapview(traj_spd_mt, zcol = "spd", at = at)
m_spd_flt <- m_base + mapview(traj_spd_flt_mt, zcol = "spd", at = at)
sync(m_spd, m_spd_flt)
To reduce the amount of in-polygon time that is dropped when track segments crossing polygon borders are removed, we can interpolate points at shorter time intervals (e.g. 10 s). Processing the whole trajectory would also work, but let's focus on the relevant segments instead.
The interpolation approach itself may or may not work in your favor: in case of unfiltered errors (e.g. GPS jumps) it can significantly increase reported distances.
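If that is a concern for your tracks, one simple guard (a sketch with a made-up threshold, assigned to a new object so the results below stay as shown) is to drop points reached at implausible speeds before interpolating:
# drop trackpoints reached at > 3 m/s; tune the threshold for your data
traj_clean_mt <-
traj_spd_flt_mt %>%
mutate(spd_chk = mt_speed(., units = NULL)) |>
filter(is.na(spd_chk) | spd_chk <= 3) |>
select(-spd_chk)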
# add polygon IDs to trackpoints
traj_spd_flt_mt <-
traj_spd_flt_mt |>
st_join(poly_sf)
# detect transition segments and interpolate
traj_ip_mt <-
traj_spd_flt_mt |>
mutate(
trans = SiteName != lag(SiteName, default = first(SiteName)) |
SiteName != lead(SiteName, default = last(SiteName))
) |>
group_by(trans, trans_id = consecutive_id(trans)) |>
group_map(\(x, grp_key){
# only modify transition groups
if (grp_key$trans) {
select(x, -SiteName) |>
mt_interpolate(time = "10 s") |>
st_join(poly_sf)
} else {
x
}
}) |>
# stack list back together and re-arrange points
mt_stack(.track_combine = "merge") |>
arrange(time)
# check interpolation
m_flt <- m_base + mapview(traj_spd_flt_mt, zcol = "SiteName")
m_ip <- m_base + mapview(traj_ip_mt, zcol = "SiteName")
sync(m_flt, m_ip)
For distances it would make sense to use the whole set of filtered trackpoints, not point subsets from polygon intersections; this way we avoid gaps when crossing polygon borders. As interpolated points are only added along straight segments, including / excluding those in the distance calculation does not really matter.
# need to cast intersection MULTILINESTRINGs to LINESTRINGs to get
# proper section distances
sections_sf <-
traj_spd_flt_mt |>
mt_track_lines() |>
st_intersection(poly_sf) |>
st_cast("LINESTRING") |>
mutate(section = paste(SiteName, row_number()), .by = SiteName) |>
mutate(len = st_length(geometry))
sections_sf
#> Simple feature collection with 5 features and 4 fields
#> Geometry type: LINESTRING
#> Dimension: XY
#> Bounding box: xmin: -119.2274 ymin: 34.24512 xmax: -119.2243 ymax: 34.24858
#> Geodetic CRS: WGS 84
#> # A tibble: 5 × 5
#> gpx SiteName geometry section len
#> * <chr> <chr> <LINESTRING [°]> <chr> [m]
#> 1 Track_2023-10-20 FRINGE_cjh… Origin (-119.2272 34.24804, -11… Origin… 712.
#> 2 Track_2023-10-20 FRINGE_cjh… Origin (-119.2263 34.24838, -11… Origin… 478.
#> 3 Track_2023-10-20 FRINGE_cjh… Origin (-119.2271 34.24512, -11… Origin… 1320.
#> 4 Track_2023-10-20 FRINGE_cjh… Urban_A… (-119.2261 34.24827, -11… Urban_… 84.4
#> 5 Track_2023-10-20 FRINGE_cjh… Urban_A… (-119.2271 34.24513, -11… Urban_… 39.2
sections_sf |>
st_drop_geometry() |>
summarise(dist = sum(len), .by = c(gpx, SiteName))
#> # A tibble: 2 × 3
#> gpx SiteName dist
#> <chr> <chr> [m]
#> 1 Track_2023-10-20 FRINGE_cjh.gpx Origin 2510.
#> 2 Track_2023-10-20 FRINGE_cjh.gpx Urban_Agriculture 124.
m_base + mapview(sections_sf, zcol = "section")
For durations we'll use the point set with interpolated trackpoints; this lets us get closer in time to the point when the polygon border might(!) have been crossed. We need to go through each individual in-polygon section to properly handle re-visits.
traj_ip_mt |>
st_drop_geometry() |>
as_tibble() |>
group_by(gpx, SiteName, site_section = consecutive_id(SiteName)) |>
# each summarise() drops one grouping level
summarise(sect_duration = as.duration(min(time) %--% max(time))) |>
summarise(duration_s = sum(sect_duration)) |>
ungroup()
#> # A tibble: 2 × 3
#> gpx SiteName duration_s
#> <chr> <chr> <dbl>
#> 1 Track_2023-10-20 FRINGE_cjh.gpx Origin 11806
#> 2 Track_2023-10-20 FRINGE_cjh.gpx Urban_Agriculture 321
All of the above combined into a single traj_stats() function. It also extracts the duration / distance summary from the GPX TrackStatsExtension for reference. A somewhat more common move2 workflow would first load more / all trajectories, but processing a single track at a time works too.
traj_stats <- function(gpx_f, poly_sf, poly_id_col = SiteName){
stats_ext <-
gpx_f |>
read_xml() |>
xml_ns_strip() |>
xml_find_first("//gpxtrkx:TrackStatsExtension") |>
as_list() |> stack() |> head(4) |>
pivot_wider(names_from = 2, values_from = 1)
traj_mt <-
# trackpoints
gpx_f |>
read_sf(query = glue::glue("SELECT '{basename(gpx_f)}' as gpx, time FROM track_points")) |>
# move2
mt_as_move2(time_column = "time", track_id_column = "gpx") %>%
# speed cluster / filter
mutate(
spd_thres = mt_speed(., units = NULL) > .4, # m/s
spd_thres_run = consecutive_id(spd_thres)
) |>
filter(spd_thres | row_number() %in% c(1, n()), .by = spd_thres_run) |>
# cross-polygon transitions
st_join(poly_sf) |>
mutate(
trans = {{ poly_id_col }} != lag({{ poly_id_col }}, default = first({{ poly_id_col }})) |
{{ poly_id_col }} != lead({{ poly_id_col }}, default = last({{ poly_id_col }}))
) |>
# interpolate
group_by(trans, trans_id = consecutive_id(trans)) |>
group_map(\(x, grp_key){
if (grp_key$trans) {
select(x, -{{ poly_id_col }}) |>
mt_interpolate(time = "10 s") |>
st_join(poly_sf)
} else {
x
}
}) |>
mt_stack(.track_combine = "merge") |>
arrange(time)
summary_dist <-
traj_mt |>
mt_track_lines() |>
# to avoid "attribute variables are assumed to be spatially constant .. " warning
st_set_agr("constant") |>
st_intersection(st_set_agr(poly_sf, "constant")) |>
st_cast("LINESTRING", warn = FALSE) |>
mutate(section = paste({{ poly_id_col }}, row_number()), .by = {{ poly_id_col }}) |>
mutate(len = st_length(geometry)) |>
st_drop_geometry() |>
summarise(dist_m = sum(len) |> as.numeric(), .by = c(gpx, {{ poly_id_col }}))
summary_duration <-
traj_mt |>
st_drop_geometry() |>
as_tibble() |>
group_by(gpx, {{ poly_id_col }}, site_section = consecutive_id({{ poly_id_col }})) |>
summarise(sect_duration = as.duration(min(time) %--% max(time)), .groups = "drop_last") |>
summarise(dur_s = sum(sect_duration), .groups = "drop")
left_join(summary_dist, summary_duration, by = join_by(gpx, {{ poly_id_col }})) |>
mutate(ext = list(stats_ext)) |>
unnest_wider(ext, names_sep = "_")
}
Loop through a vector of gpx files:
# gpxs: vector of gpx file names, currently with just a single entry
gpxs |>
map(traj_stats, poly_sf = poly_sf) |>
list_rbind()
#> # A tibble: 2 × 8
#> gpx SiteName dist_m dur_s ext_Distance ext_TotalElapsedTime ext_MovingTime
#> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr>
#> 1 Track_… Origin 2510. 11806 2923 12191 3936
#> 2 Track_… Urban_A… 124. 321 2923 12191 3936
#> # ℹ 1 more variable: ext_StoppedTime <chr>
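As a possible sanity check (a sketch using the columns from the output above), compare the summed per-polygon durations against the total elapsed time reported by the GPX extension:
gpxs |>
map(traj_stats, poly_sf = poly_sf) |>
list_rbind() |>
summarise(
dur_total_s = sum(dur_s),
elapsed_s = as.numeric(first(ext_TotalElapsedTime)),
coverage = dur_total_s / elapsed_s,
.by = gpx
)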