I'm trying to group rows that have values within specific error/tolerance.
Input looks like this:
input <- data.frame(Row_number = 1:22,
Name = c(rep("A",6), rep("B",4), rep("A",12)),
Time1 = c(1236, 1236, 1235, 1237, 1287, 1288, 570, 1092, 1092, 1092, 1236, 1237, 1238, 2000, 2001, 2003, 1500, 1502, 1504, 3000, 3002, 3004),
Time2 = c(2.315, 2.317, 2.314, 3.012, 4.360, 4.358, 2.602, 2.603, 2.605, 2.610, 2.320, 2.360, 2.400, 5.000, 5.010, 5.015, 6.000, 6.050, 6.100, 1.000, 10.000, 1.040))
Row_number Name Time1 Time2
1 A 1236 2.315
2 A 1236 2.317
3 A 1235 2.314
4 A 1237 3.012
5 A 1287 4.360
6 A 1288 4.358
7 B 570 2.602
8 B 1092 2.603
9 B 1092 2.605
10 B 1092 2.610
11 A 1236 2.320
12 A 1237 2.360
13 A 1238 2.400
14 A 2000 5.000
15 A 2001 5.010
16 A 2003 5.015
17 A 1500 6.000
18 A 1502 6.050
19 A 1504 6.100
20 A 3000 1.000
21 A 3002 10.000
22 A 3004 1.040
And output would have "groups", like below:
Group_Name Name Time1 Time2 Row_number
A_1 A 1235 2.314 3
A_1 A 1236 2.315 1
A_1 A 1236 2.317 2
A_1 A 1236 2.320 11
A_1 A 1237 2.360 12
A_1 A 1238 2.400 13
A_2 A 1237 3.012 4
A_3 A 1287 4.360 5
A_3 A 1288 4.358 6
A_4 A 2000 5.000 14
A_4 A 2001 5.010 15
A_4 A 2003 5.015 16
A_5 A 1500 6.000 17
A_5 A 1502 6.050 18
A_5 A 1504 6.100 19
A_6 A 3000 1.000 20
A_7 A 3002 10.000 21
A_8 A 3004 1.040 22
B_1 B 570 2.602 7
B_2 B 1092 2.603 8
B_2 B 1092 2.605 9
B_2 B 1092 2.610 10
In this example, tolerance for Time1
, and Time2
is 2 and 0.05 respectively.
If both Time1
and Time2
overlap within the tolerance, they should be in the same group (rows 1-3, 8-10).
If Time1
overlaps but Time2
doesn't, they should be in separate groups (rows 1-3 vs row 4). Similarly, if Time2
overlaps but Time1
doesn't, they're separate groups (rows 20 vs 22).
Considering transitive connections:
If adjacent rows pass tolerance but non-adjacent rows don't, they should still be grouped together (rows 1-3 and 11-13, rows 14-16, rows 17-19).
I added in rows 20-22 because I wanted to give an example where the igraph
solution below wouldn't work, because it seems to group rows 20 and 22 into one, I think because it mutates grp2
within the grp1
cluster. I'm working on editing it from here, but if there are any other algorithms or approaches, I'm curious to see what they may be.
We can use disjoint set approach:
library(dplyr)
group_by_tolerance_unionfind <- function(df, time1_tol = 2, time2_tol = 0.05) {
df |>
group_by(Name) |>
group_modify(~ {
n <- nrow(.x)
if (n == 1) {
.x$Group_name <- paste0(.y$Name[1], "_1")
return(.x)
}
parent <- 1:n
find_root <- function(x) {
if (parent[x] != x) {
parent[x] <<- find_root(parent[x])
}
parent[x]
}
union_sets <- function(x, y) {
root_x <- find_root(x)
root_y <- find_root(y)
if (root_x != root_y) {
parent[root_x] <<- root_y
}
}
for (i in 1:(n-1)) {
for (j in (i+1):n) {
if (abs(.x$Time1[i] - .x$Time1[j]) <= time1_tol &&
abs(.x$Time2[i] - .x$Time2[j]) <= time2_tol) {
union_sets(i, j)
}
}
}
roots <- sapply(1:n, find_root)
group_mapping <- as.numeric(factor(roots))
.x$Group_name <- paste0(.y$Name[1], "_", group_mapping)
.x
}) |>
ungroup()
}
{igraph}
Or using {igraph}
but creating the adjacency matrix more robustly:
library(igraph)
group_by_tolerance_igraph <- function(df, time1_tol = 2, time2_tol = 0.05) {
df |>
group_by(Name) |>
group_modify(~ {
n <- nrow(.x)
adj_matrix <- outer(1:n, 1:n, function(i, j) {
abs(.x$Time1[i] - .x$Time1[j]) <= time1_tol &
abs(.x$Time2[i] - .x$Time2[j]) <= time2_tol
})
diag(adj_matrix) <- FALSE
g <- graph_from_adjacency_matrix(adj_matrix, mode = "undirected")
groups <- components(g)$membership
.x$Group_name <- paste0(.y$Name[1], "_", groups)
.x
}) |>
ungroup()
}
The group names/order between the two solutions is not the same, but the same rows are grouped/clustered together:
group_by_tolerance_unionfind(input) |> arrange(Row_number)
#> # A tibble: 22 × 5
#> Name Row_number Time1 Time2 Group_name
#> <chr> <int> <dbl> <dbl> <chr>
#> 1 A 1 1236 2.32 A_3
#> 2 A 2 1236 2.32 A_3
#> 3 A 3 1235 2.31 A_3
#> 4 A 4 1237 3.01 A_1
#> 5 A 5 1287 4.36 A_2
#> 6 A 6 1288 4.36 A_2
#> 7 B 7 570 2.60 B_1
#> 8 B 8 1092 2.60 B_2
#> 9 B 9 1092 2.60 B_2
#> 10 B 10 1092 2.61 B_2
#> # ℹ 12 more rows
group_by_tolerance_igraph(input) |> arrange(Row_number)
#> # A tibble: 22 × 5
#> Name Row_number Time1 Time2 Group_name
#> <chr> <int> <dbl> <dbl> <chr>
#> 1 A 1 1236 2.32 A_1
#> 2 A 2 1236 2.32 A_1
#> 3 A 3 1235 2.31 A_1
#> 4 A 4 1237 3.01 A_2
#> 5 A 5 1287 4.36 A_3
#> 6 A 6 1288 4.36 A_3
#> 7 B 7 570 2.60 B_1
#> 8 B 8 1092 2.60 B_2
#> 9 B 9 1092 2.60 B_2
#> 10 B 10 1092 2.61 B_2
#> # ℹ 12 more rows
Benchmarking with the same large sample dataset as I shared below, we'll see that considering the transitive connections (exact tolerance) adds to the complexity of the problem and the solutions become order(s) of magnitude slower (although it's apples to oranges):
bm <- microbenchmark::microbenchmark(
DSU = group_by_tolerance_unionfind(large_dataset),
iGraph = group_by_tolerance_igraph(large_dataset),
times = 100
)
ggplot2::autoplot(bm) +
ggplot2::theme_bw()
Created on 2025-08-21 with reprex v2.1.1
{cluster}
This is basically a cluster analysis problem and we can use {cluster}
:
library(cluster)
library(dplyr)
time1_tol = 2
time2_tol = 0.05
input |>
group_by(Name) |>
group_modify(~ {
scaled_data <- cbind(.x$Time1 / time1_tol, .x$Time2 / time2_tol)
dist_matrix <- dist(scaled_data, method = "manhattan")
hc <- hclust(dist_matrix, method = "single")
groups <- cutree(hc, h = 1)
.x$Group_name <- paste0(.y$Name[1], "-", groups)
.x
}) |>
ungroup() |>
select(Group_name, everything(), -Row_number)
#> # A tibble: 10 × 4
#> Group_name Name Time1 Time2
#> <chr> <chr> <dbl> <dbl>
#> 1 A-1 A 1236 2.32
#> 2 A-1 A 1236 2.32
#> 3 A-1 A 1235 2.31
#> 4 A-2 A 1237 3.01
#> 5 A-3 A 1287 4.36
#> 6 A-3 A 1288 4.36
#> 7 B-1 B 570 2.60
#> 8 B-2 B 1092 2.60
#> 9 B-2 B 1092 2.60
#> 10 B-2 B 1092 2.61
{dbscan}
Or use {dbscan}
which is more efficient:
library(dbscan)
input |>
group_by(Name) |>
group_modify(~ {
scaled_data <- cbind(.x$Time1 / time1_tol, .x$Time2 / time2_tol)
db <- dbscan(scaled_data, eps = 1, minPts = 1)
.x$Group_name <- paste0(.y$Name[1], "-", db$cluster)
.x
}) |>
ungroup() |>
select(Group_name, everything(), -Row_number)
#> # A tibble: 10 × 4
#> Group_name Name Time1 Time2
#> <chr> <chr> <dbl> <dbl>
#> 1 A-1 A 1236 2.32
#> 2 A-1 A 1236 2.32
#> 3 A-1 A 1235 2.31
#> 4 A-2 A 1237 3.01
#> 5 A-3 A 1287 4.36
#> 6 A-3 A 1288 4.36
#> 7 B-1 B 570 2.60
#> 8 B-2 B 1092 2.60
#> 9 B-2 B 1092 2.60
#> 10 B-2 B 1092 2.61
library(dplyr)
library(dbscan)
library(cluster)
## large test dataset
set.seed(42)
n_groups <- 26
min_obs <- 50
max_obs <- 200
group_names <- LETTERS[1:n_groups]
data_list <- list()
row_counter <- 1
for (group in group_names) {
n_obs <- sample(min_obs:max_obs, 1)
n_clusters <- sample(2:5, 1)
cluster_centers_time1 <- sample(500:2000, n_clusters, replace = FALSE)
cluster_centers_time2 <- runif(n_clusters, 1, 5)
group_data <- data.frame()
for (i in 1:n_clusters) {
cluster_size <- max(1, round(n_obs / n_clusters + rnorm(1, 0, 5)))
time1_vals <- cluster_centers_time1[i] + rnorm(cluster_size, 0, 0.8)
time2_vals <- cluster_centers_time2[i] + rnorm(cluster_size, 0, 0.02)
outlier_indices <- sample(1:cluster_size,
max(1, round(cluster_size * 0.1)))
time1_vals[outlier_indices] <- time1_vals[outlier_indices] +
sample(c(-1, 1), length(outlier_indices), replace = TRUE) *
runif(length(outlier_indices), 5, 20)
time2_vals[outlier_indices] <- time2_vals[outlier_indices] +
sample(c(-1, 1), length(outlier_indices), replace = TRUE) *
runif(length(outlier_indices), 0.1, 0.5)
cluster_data <- data.frame(
Row_number = row_counter:(row_counter + cluster_size - 1),
Name = group,
Time1 = time1_vals,
Time2 = time2_vals
)
group_data <- rbind(group_data, cluster_data)
row_counter <- row_counter + cluster_size
}
data_list[[group]] <- group_data
}
large_dataset <- do.call(rbind, data_list)
rownames(large_dataset) <- NULL
## DBSCAN
group_by_tolerance_dbscan <- function(df, time1_tol = 2, time2_tol = 0.05) {
df |>
group_by(Name) |>
group_modify(~ {
if (nrow(.x) == 1) {
.x$Group_name <- paste0(.y$Name[1], "-1")
return(.x)
}
scaled_data <- cbind(.x$Time1 / time1_tol, .x$Time2 / time2_tol)
db <- dbscan(scaled_data, eps = 1, minPts = 1)
.x$Group_name <- paste0(.y$Name[1], "-", db$cluster)
.x
}) |>
ungroup()
}
## Hierarchical clustering
group_by_tolerance_hclust <- function(df, time1_tol = 2, time2_tol = 0.05) {
df |>
group_by(Name) |>
group_modify(~ {
if (nrow(.x) == 1) {
.x$Group_name <- paste0(.y$Name[1], "-1")
return(.x)
}
scaled_data <- cbind(.x$Time1 / time1_tol, .x$Time2 / time2_tol)
dist_matrix <- dist(scaled_data, method = "euclidean")
hc <- hclust(dist_matrix, method = "single")
groups <- cutree(hc, h = 1)
.x$Group_name <- paste0(.y$Name[1], "-", groups)
.x
}) |>
ungroup()
}
bm <- microbenchmark::microbenchmark(
DBSCAN = group_by_tolerance_dbscan(large_dataset),
Hierarchical = group_by_tolerance_hclust(large_dataset),
times = 100, check = "equal"
)
ggplot2::autoplot(bm) +
ggplot2::theme_bw()
Created on 2025-08-18 with reprex v2.1.1