r, dataframe, dplyr, grouping, cluster-analysis

Efficiently group rows within tolerance for multiple numeric columns


I'm trying to group rows that have values within a specific error/tolerance of each other.

Input looks like this:

input <- data.frame(Row_number = 1:22,
                    Name = c(rep("A",6), rep("B",4), rep("A",12)),
                    Time1 = c(1236, 1236, 1235, 1237, 1287, 1288, 570, 1092, 1092, 1092, 1236, 1237, 1238, 2000, 2001, 2003, 1500, 1502, 1504, 3000, 3002, 3004),
                    Time2 = c(2.315, 2.317, 2.314, 3.012, 4.360, 4.358, 2.602, 2.603, 2.605, 2.610, 2.320, 2.360, 2.400, 5.000, 5.010, 5.015, 6.000, 6.050, 6.100, 1.000, 10.000, 1.040))

Row_number Name Time1  Time2
1    A  1236  2.315
2    A  1236  2.317
3    A  1235  2.314
4    A  1237  3.012
5    A  1287  4.360
6    A  1288  4.358
7    B   570  2.602
8    B  1092  2.603
9    B  1092  2.605
10    B  1092  2.610
11    A  1236  2.320
12    A  1237  2.360
13    A  1238  2.400
14    A  2000  5.000
15    A  2001  5.010
16    A  2003  5.015
17    A  1500  6.000
18    A  1502  6.050
19    A  1504  6.100
20    A  3000  1.000
21    A  3002 10.000
22    A  3004  1.040

And the output would have "groups", like below:

Group_Name  Name  Time1  Time2  Row_number
A_1         A     1235   2.314           3
A_1         A     1236   2.315           1
A_1         A     1236   2.317           2
A_1         A     1236   2.320          11
A_1         A     1237   2.360          12
A_1         A     1238   2.400          13
A_2         A     1237   3.012           4
A_3         A     1287   4.360           5
A_3         A     1288   4.358           6
A_4         A     2000   5.000          14
A_4         A     2001   5.010          15
A_4         A     2003   5.015          16
A_5         A     1500   6.000          17
A_5         A     1502   6.050          18
A_5         A     1504   6.100          19
A_6         A     3000   1.000          20
A_7         A     3002  10.000          21
A_8         A     3004   1.040          22
B_1         B      570   2.602           7
B_2         B     1092   2.603           8
B_2         B     1092   2.605           9
B_2         B     1092   2.610          10

In this example, the tolerances for Time1 and Time2 are 2 and 0.05, respectively.

If both Time1 and Time2 overlap within the tolerance, the rows should be in the same group (rows 1-3, 8-10). If Time1 overlaps but Time2 doesn't, they should be in separate groups (rows 1-3 vs. row 4). Similarly, if Time2 overlaps but Time1 doesn't, they're separate groups (row 20 vs. row 22).

Transitive connections should also be considered: if adjacent rows pass the tolerance check but non-adjacent rows don't, they should still end up in the same group (rows 1-3 and 11-13, rows 14-16, rows 17-19).

I added rows 20-22 to give an example where the igraph solution below wouldn't work: it seems to group rows 20 and 22 into one, I think because it mutates grp2 within the grp1 cluster. I'm working on editing it from here, but if there are any other algorithms or approaches, I'm curious to see what they may be.
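
To make the rule concrete, here's a small helper (just for illustration, not part of the expected solution) that checks whether two rows are directly "connected":

within_tol <- function(i, j, df, time1_tol = 2, time2_tol = 0.05) {
  abs(df$Time1[i] - df$Time1[j]) <= time1_tol &&
    abs(df$Time2[i] - df$Time2[j]) <= time2_tol
}

within_tol(1, 3, input)   # TRUE:  both Time1 and Time2 overlap
within_tol(1, 4, input)   # FALSE: Time1 overlaps but Time2 doesn't
within_tol(20, 22, input) # FALSE: Time2 overlaps but Time1 doesn't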


Solution

  • Considering transitive connections:

    Disjoint Set (Union-Find Data Structure)

    We can use a disjoint-set (union-find) approach:

    library(dplyr)
    
    group_by_tolerance_unionfind <- function(df, time1_tol = 2, time2_tol = 0.05) {
      df |>
        group_by(Name) |>
        group_modify(~ {
          n <- nrow(.x)
          if (n == 1) {
            .x$Group_name <- paste0(.y$Name[1], "_1")
            return(.x)
          }
          
          # each row starts in its own set
          parent <- 1:n
          
          # find the root of x's set, compressing the path along the way
          find_root <- function(x) {
            if (parent[x] != x) {
              parent[x] <<- find_root(parent[x])
            }
            parent[x]
          }
          
          # merge the sets containing x and y
          union_sets <- function(x, y) {
            root_x <- find_root(x)
            root_y <- find_root(y)
            if (root_x != root_y) {
              parent[root_x] <<- root_y
            }
          }
          
          # union every pair of rows that is within tolerance on BOTH columns
          for (i in 1:(n-1)) {
            for (j in (i+1):n) {
              if (abs(.x$Time1[i] - .x$Time1[j]) <= time1_tol &&
                  abs(.x$Time2[i] - .x$Time2[j]) <= time2_tol) {
                union_sets(i, j)
              }
            }
          }
          
          # rows sharing a root belong to the same group
          roots <- sapply(1:n, find_root)
          group_mapping <- as.numeric(factor(roots))
          .x$Group_name <- paste0(.y$Name[1], "_", group_mapping)
          .x
        }) |>
        ungroup()
    }
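
    find_root() uses path compression (the parent[x] <<- ... assignment), so repeated lookups stay cheap; the pairwise double loop is still O(n²) per Name group. A minimal standalone sketch (toy indices, not the original data) of how the union-find pass chains transitive connections:

    parent <- 1:3
    find_root <- function(x) {
      if (parent[x] != x) parent[x] <<- find_root(parent[x])
      parent[x]
    }
    union_sets <- function(x, y) {
      rx <- find_root(x); ry <- find_root(y)
      if (rx != ry) parent[rx] <<- ry
    }
    union_sets(1, 2)        # rows 1 and 2 are within tolerance
    union_sets(2, 3)        # rows 2 and 3 are within tolerance
    sapply(1:3, find_root)  # 1 and 3 were never unioned directly...
    #> [1] 3 3 3            # ...but all three share one root -> one group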
    
    {igraph}

    Or use {igraph}, building the adjacency matrix more robustly:

    library(igraph)
    
    group_by_tolerance_igraph <- function(df, time1_tol = 2, time2_tol = 0.05) {
      df |>
        group_by(Name) |>
        group_modify(~ {
          n <- nrow(.x)
          # TRUE wherever a pair of rows is within tolerance on BOTH columns
          adj_matrix <- outer(1:n, 1:n, function(i, j) {
            abs(.x$Time1[i] - .x$Time1[j]) <= time1_tol &
              abs(.x$Time2[i] - .x$Time2[j]) <= time2_tol
          })
          diag(adj_matrix) <- FALSE
          
          # connected components of the tolerance graph = transitive groups
          g <- graph_from_adjacency_matrix(adj_matrix, mode = "undirected")
          groups <- components(g)$membership
          .x$Group_name <- paste0(.y$Name[1], "_", groups)
          .x
        }) |>
        ungroup()
    }
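
    Note that components() numbers components in vertex order (the component containing vertex 1 gets id 1), which is why its labels differ from the union-find labels even though the partitions match. A tiny sketch (toy graph, my own illustration):

    library(igraph)
    g <- make_graph(edges = c(1, 2, 2, 3), n = 5, directed = FALSE)
    components(g)$membership
    #> [1] 1 1 1 2 3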
    

    As noted, the group names/order differ between the two solutions, but the same rows are grouped/clustered together:

    group_by_tolerance_unionfind(input) |> arrange(Row_number)
    
    #> # A tibble: 22 × 5
    #>    Name  Row_number Time1 Time2 Group_name
    #>    <chr>      <int> <dbl> <dbl> <chr>     
    #>  1 A              1  1236  2.32 A_3       
    #>  2 A              2  1236  2.32 A_3       
    #>  3 A              3  1235  2.31 A_3       
    #>  4 A              4  1237  3.01 A_1       
    #>  5 A              5  1287  4.36 A_2       
    #>  6 A              6  1288  4.36 A_2       
    #>  7 B              7   570  2.60 B_1       
    #>  8 B              8  1092  2.60 B_2       
    #>  9 B              9  1092  2.60 B_2       
    #> 10 B             10  1092  2.61 B_2       
    #> # ℹ 12 more rows
    
    group_by_tolerance_igraph(input) |> arrange(Row_number)
    
    #> # A tibble: 22 × 5
    #>    Name  Row_number Time1 Time2 Group_name
    #>    <chr>      <int> <dbl> <dbl> <chr>     
    #>  1 A              1  1236  2.32 A_1       
    #>  2 A              2  1236  2.32 A_1       
    #>  3 A              3  1235  2.31 A_1       
    #>  4 A              4  1237  3.01 A_2       
    #>  5 A              5  1287  4.36 A_3       
    #>  6 A              6  1288  4.36 A_3       
    #>  7 B              7   570  2.60 B_1       
    #>  8 B              8  1092  2.60 B_2       
    #>  9 B              9  1092  2.60 B_2       
    #> 10 B             10  1092  2.61 B_2       
    #> # ℹ 12 more rows
    

    Benchmark:

    Benchmarking with the same large sample dataset as shared below, we'll see that honoring the transitive connections (exact tolerance) adds to the complexity of the problem, and these solutions become order(s) of magnitude slower than the distance-based ones below (although it's an apples-to-oranges comparison):

    bm <- microbenchmark::microbenchmark(
      DSU = group_by_tolerance_unionfind(large_dataset),
      iGraph = group_by_tolerance_igraph(large_dataset),
      times = 100
    )
    
    ggplot2::autoplot(bm) +
      ggplot2::theme_bw()
    

    Created on 2025-08-21 with reprex v2.1.1



    Distance-based clustering:

    {cluster}

    This is basically a cluster-analysis problem, so we can use {cluster}:

    library(cluster)
    library(dplyr)
    
    time1_tol <- 2
    time2_tol <- 0.05
    
    input |>
      group_by(Name) |>
      group_modify(~ {
        # scale each column by its tolerance so "within tolerance"
        # becomes "scaled difference <= 1"
        scaled_data <- cbind(.x$Time1 / time1_tol, .x$Time2 / time2_tol)
        dist_matrix <- dist(scaled_data, method = "manhattan")
        # single linkage + cut at height 1 chains transitive connections
        hc <- hclust(dist_matrix, method = "single")
        groups <- cutree(hc, h = 1)
        .x$Group_name <- paste0(.y$Name[1], "-", groups)
        .x
      }) |>
      ungroup() |>
      select(Group_name, everything(), -Row_number)
    
    #> # A tibble: 10 × 4
    #>    Group_name Name  Time1 Time2
    #>    <chr>      <chr> <dbl> <dbl>
    #>  1 A-1        A      1236  2.32
    #>  2 A-1        A      1236  2.32
    #>  3 A-1        A      1235  2.31
    #>  4 A-2        A      1237  3.01
    #>  5 A-3        A      1287  4.36
    #>  6 A-3        A      1288  4.36
    #>  7 B-1        B       570  2.60
    #>  8 B-2        B      1092  2.60
    #>  9 B-2        B      1092  2.60
    #> 10 B-2        B      1092  2.61
    
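    One caveat (my own observation, not part of the original answer): on tolerance-scaled columns, the pairwise rule "both columns within tolerance" corresponds exactly to a Chebyshev distance of at most 1, i.e. dist(..., method = "maximum"). "manhattan" and "euclidean" are stricter for off-axis differences; e.g. rows 17 and 18 of the input differ by exactly the tolerance in both columns, giving Chebyshev distance 1 but Euclidean distance sqrt(2). A hedged sketch:

    # sketch: Chebyshev ("maximum") distance on tolerance-scaled columns is
    # <= 1 exactly when BOTH columns are within tolerance, so single linkage
    # cut at h = 1 reproduces the pairwise tolerance rule exactly
    xA <- subset(input, Name == "A")
    scaled <- cbind(xA$Time1 / time1_tol, xA$Time2 / time2_tol)
    hc <- hclust(dist(scaled, method = "maximum"), method = "single")
    cutree(hc, h = 1)  # rows 17-19 chain into one group, as in the expected output
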
    {dbscan}

    Or use {dbscan}, which is more efficient:

    library(dbscan)
    
    input |>
      group_by(Name) |>
      group_modify(~ {
        scaled_data <- cbind(.x$Time1 / time1_tol, .x$Time2 / time2_tol)
        # minPts = 1: every point is a core point, so there is no noise and
        # clusters are the connected components of the eps-neighborhood graph
        db <- dbscan(scaled_data, eps = 1, minPts = 1)
        .x$Group_name <- paste0(.y$Name[1], "-", db$cluster)
        .x
      }) |>
      ungroup() |>
      select(Group_name, everything(), -Row_number)
    
    #> # A tibble: 10 × 4
    #>    Group_name Name  Time1 Time2
    #>    <chr>      <chr> <dbl> <dbl>
    #>  1 A-1        A      1236  2.32
    #>  2 A-1        A      1236  2.32
    #>  3 A-1        A      1235  2.31
    #>  4 A-2        A      1237  3.01
    #>  5 A-3        A      1287  4.36
    #>  6 A-3        A      1288  4.36
    #>  7 B-1        B       570  2.60
    #>  8 B-2        B      1092  2.60
    #>  9 B-2        B      1092  2.60
    #> 10 B-2        B      1092  2.61
    
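    Since dbscan's eps-neighborhood is Euclidean on the scaled data, the same off-axis caveat as above applies here too. A toy sketch (made-up points, my own illustration) of the transitive chaining you get with minPts = 1:

    library(dbscan)
    pts <- cbind(c(0, 0.9, 1.8, 5), c(0, 0, 0, 0))
    # 1~2 and 2~3 are within eps = 1, 1~3 is not, yet all three chain together
    dbscan(pts, eps = 1, minPts = 1)$cluster
    #> [1] 1 1 1 2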

    Benchmark:

    library(dplyr)
    library(dbscan)
    library(cluster)
    
    ## large test dataset
    set.seed(42)
    n_groups <- 26
    min_obs <- 50 
    max_obs <- 200
    group_names <- LETTERS[1:n_groups]
    data_list <- list()
    row_counter <- 1
    for (group in group_names) {
      n_obs <- sample(min_obs:max_obs, 1)
      n_clusters <- sample(2:5, 1)
      cluster_centers_time1 <- sample(500:2000, n_clusters, replace = FALSE)
      cluster_centers_time2 <- runif(n_clusters, 1, 5)
      group_data <- data.frame()
      for (i in 1:n_clusters) {
        cluster_size <- max(1, round(n_obs / n_clusters + rnorm(1, 0, 5)))
        time1_vals <- cluster_centers_time1[i] + rnorm(cluster_size, 0, 0.8)  
        time2_vals <- cluster_centers_time2[i] + rnorm(cluster_size, 0, 0.02) 
        outlier_indices <- sample(1:cluster_size, 
                                  max(1, round(cluster_size * 0.1)))
        time1_vals[outlier_indices] <- time1_vals[outlier_indices] + 
          sample(c(-1, 1), length(outlier_indices), replace = TRUE) * 
          runif(length(outlier_indices), 5, 20)
        time2_vals[outlier_indices] <- time2_vals[outlier_indices] + 
          sample(c(-1, 1), length(outlier_indices), replace = TRUE) * 
          runif(length(outlier_indices), 0.1, 0.5)
        cluster_data <- data.frame(
          Row_number = row_counter:(row_counter + cluster_size - 1),
          Name = group,
          Time1 = time1_vals,
          Time2 = time2_vals
        )
        group_data <- rbind(group_data, cluster_data)
        row_counter <- row_counter + cluster_size
      }
      data_list[[group]] <- group_data
    }
    large_dataset <- do.call(rbind, data_list)
    rownames(large_dataset) <- NULL
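    ## the generator above yields 26 Name groups of ~50-200 rows each (a few
    ## thousand rows in total), with ~10% of each cluster displaced as outliers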
    
    ## DBSCAN
    group_by_tolerance_dbscan <- function(df, time1_tol = 2, time2_tol = 0.05) {
      df |>
        group_by(Name) |>
        group_modify(~ {
          if (nrow(.x) == 1) {
            .x$Group_name <- paste0(.y$Name[1], "-1")
            return(.x)
          }
          scaled_data <- cbind(.x$Time1 / time1_tol, .x$Time2 / time2_tol)
          db <- dbscan(scaled_data, eps = 1, minPts = 1)
          .x$Group_name <- paste0(.y$Name[1], "-", db$cluster)
          .x
        }) |>
        ungroup()
    }
    
    ## Hierarchical clustering
    group_by_tolerance_hclust <- function(df, time1_tol = 2, time2_tol = 0.05) {
      df |>
        group_by(Name) |>
        group_modify(~ {
          if (nrow(.x) == 1) {
            .x$Group_name <- paste0(.y$Name[1], "-1")
            return(.x)
          }
          scaled_data <- cbind(.x$Time1 / time1_tol, .x$Time2 / time2_tol)
          dist_matrix <- dist(scaled_data, method = "euclidean")
          hc <- hclust(dist_matrix, method = "single")
          groups <- cutree(hc, h = 1)
          .x$Group_name <- paste0(.y$Name[1], "-", groups)
          .x
        }) |>
        ungroup()
    }
    
    bm <- microbenchmark::microbenchmark(
      DBSCAN = group_by_tolerance_dbscan(large_dataset),
      Hierarchical = group_by_tolerance_hclust(large_dataset),
      times = 100, check = "equal"
    )
    
    ggplot2::autoplot(bm) +
      ggplot2::theme_bw()
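
    Note that check = "equal" makes microbenchmark verify on every run that both expressions return the same result, so the benchmark doubles as a check that DBSCAN (minPts = 1) and single-linkage hclust cut at h = 1 produce identical groupings on this data.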
    

    Created on 2025-08-18 with reprex v2.1.1