rstringperformance

Fast pairwise longest common substring from start


Suppose there are many binary strings

x <- c("0100100010101010", "0100110010101010","0111001000","010111")

I am looking for a fast method in R to output a matrix containing the pairwise (excluding self-matches) longest common substring from the start. For example, a solution could look like this

> mySolution(x)
    [,1]     [,2]      [,3]      [,4]
[1,] ""       "01001"   "01"     "010"
[2,] "01001"  ""        "01"     "010"
[3,] "01"     "01"      ""       "01"
[4,] "010"    "010"     "01"     ""

E.g., the matrix at position [1,2] is the longest common substring from the start of x[1] and x[2]. Note that the resulting matrix is symmetric. So, we only need to compute one half of it.

I know that could work with functions like substr,sub,grepl, but since I have many strings, I am looking for a very efficient solution that requires little computation time. Maybe it is an option to convert to binary numbers to improve performance?


Solution

  • Update

    An even faster solution for large vectors of binary strings. The function flcs below is able to perform the desired operation on 10k binary character strings in about 1 second.

    The idea is that if two binary strings are seen as big-endian representations of integer values, then the absolute value of the difference of the two values will have a number of leading zeros equal to their longest common series of bits from the beginning.

    Demonstrating:

    as.integer(intToBits(29))
    #>  [1] 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
    as.integer(intToBits(13))
    #>  [1] 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
    as.integer(intToBits(abs(29 - 13)))
    #>  [1] 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
    

    The big-endian binary representation of 16 has 4 leading zeros, which is the number of same leading bits between 29 and 13 (see https://oeis.org/A007814).

    One option is to convert each string to integer, calculate all pairwise distances, then determine the number of leading zeros. However, since the length of the strings can exceed 31, this would require working with arbitrarily large integers using, e.g., gmp::bigz, which is relatively slow.

    The algorithm used by flcs first pre-computes the number of leading zeros for all binary strings of length d. Then the strings are broken into substrings of length d and sequentially checked via differencing and indexing until there is no longer a complete match. (The algorithm is similar to the previous answer, but it processes d characters at once.) All computations are performed on base R integer values, so the algorithm is very fast.

    system.time({
      d <- 12L
      nn <- integer(2^d)
      d2 <- 2^(0:d)
      nn[sequence(d2[(d - 1):1], d2[2:d] + 1, d2[3:(d + 1)])] <-
        rep.int(1:(d - 1), d2[(d - 1):1])
      nn[1] <- d
    })
    #>    user  system elapsed 
    #>       0       0       0
    

    flcs returns the endpoint of the pairwise common substring, which can be used to fill the bottom triangle of the desired matrix.

    library(data.table) # for `tstrsplit`
    library(stringi) # for `stri_reverse`
    
    flcs <- function(x) {
      nx <- length(x)
      nx1 <- nx - 1L
      j2 <- sequence(nx1:1, 2:nx)
      y <- tstrsplit(gsub(sprintf("(.{%d})", d), "\\1 ", x), " ")
      y[[1]] <- strtoi(stri_reverse(y[[1]]), base = 2)
      n <- nn[abs(rep.int(y[[1]][-nx], nx1:1) - y[[1]][j2]) + 1L]
      idx <- which(n == d)
    
      if (length(idx)) {
        j1 <- rep.int(1:nx1, nx1:1)
    
        for (k in (1:length(y))[-1]) {
          y[[k]] <- strtoi(stri_reverse(y[[k]]), base = 2)
          j3 <- abs(y[[k]][j1[idx]] - y[[k]][j2[idx]])
          idx <- idx[b <- !is.na(j3)]
          nplus <- nn[j3[b] + 1L]
          n[idx] <- n[idx] + nplus
          idx <- idx[nplus == d]
          if (!length(idx)) break
        }
      }
    
      lens <- nchar(x)
      pmin(n, rep.int(lens[-nx], nx1:1), lens[j2])
    }
    

    Demonstrating:

    x <- c("0100100010101010", "0100110010101010","0111001000","010111")
    flcs(x)
    #> [1] 5 2 3 2 3 2
    

    A function to build the final matrix from the output of flcs:

    buildmat <- function(x, n, symmetric = FALSE, fill = "") {
      nx <- length(x)
      nx1 <- nx - 1L
      z <- matrix(fill, nx, nx)
      if (symmetric) {
        z[sequence(nx1:1, seq(2, nx^2, nx + 1))] <-
          z[sequence(nx1:1, seq(nx + 1, nx^2, nx + 1), nx)] <-
          substr(x[rep.int(1:nx1, nx1:1)], 1, n)
      } else {
        z[sequence(nx1:1, seq(2, nx^2, nx + 1))] <-
          substr(x[rep.int(1:nx1, nx1:1)], 1, n)
      }
      
      z
    }
    
    buildmat(x, flcs(x), TRUE)
    #>      [,1]    [,2]    [,3] [,4] 
    #> [1,] ""      "01001" "01" "010"
    #> [2,] "01001" ""      "01" "010"
    #> [3,] "01"    "01"    ""   "01" 
    #> [4,] "010"   "010"   "01" ""
    

    Compare to three other approaches. The first compares each string to each other string in parallel. The other two are from the answers given by @ThomasIsCoding and @AndreWildberg.

    library(parallel)
    
    f <- function(x, y) {
      n <- min(length(x[[1]]), length(x[[2]]))
      out <- which.max(x[[1]][1:n] != x[[2]][1:n])
      if (out == 1L && x[[1]][1] == x[[2]][1]) n + 1L else out
    }
    
    cl <- makeCluster(detectCores() - 1) # 15 cores
    clusterExport(cl, c("f"), environment(f))
    
    flcsPar <- function(x) {
      unlist(parLapply(cl, combn(lapply(x, utf8ToInt), 2, NULL, FALSE), f)) - 1L
    }
    
    getPsub <- function(dat){ # from @AndreWildberg
      len <- seq_along(dat)
      mlen <- max(len)
      mat <- matrix(NA, mlen, mlen)
      d <- do.call(rbind, tstrsplit(dat, ""))
      mat[lower.tri(mat)] <- apply(combn(len, 2), 2, \(x){
        res <- d[,x[1]] == d[,x[2]]
        paste(d[rleid(res) == 1 & res, x[1]], collapse="")})
      mat
    }
    
    helper <- function(a, b) { # from @ThomasIsCoding
      l <- min(nchar(a), nchar(b))
      aa <- substr(a, 1, l)
      bb <- substr(b, 1, l)
      vaa <- utf8ToInt(aa)
      vbb <- utf8ToInt(bb)
      if (aa == bb) {
        aa
      } else {
        k <- which.max(vaa != vbb) - 1
        ifelse(k > 0, intToUtf8(vaa[1:k]), "")
      }
    }
    
    f2 <- function(x) { # from @ThomasIsCoding
      res <- matrix(rep(NA, length(x)^2), length(x))
      v <- combn(x, 2, \(p) helper(p[1], p[2]))
      res[lower.tri(res)] <- v
      res
    }
    

    The functions by @ThomasIsCoding and @AndreWildberg have been modified slightly to give identical outputs:

    buildmat(x, flcsPar(x), FALSE, NA)
    #>      [,1]    [,2]  [,3] [,4]
    #> [1,] NA      NA    NA   NA  
    #> [2,] "01001" NA    NA   NA  
    #> [3,] "01"    "01"  NA   NA  
    #> [4,] "010"   "010" "01" NA
    getPsub(x)
    #>      [,1]    [,2]  [,3] [,4]
    #> [1,] NA      NA    NA   NA  
    #> [2,] "01001" NA    NA   NA  
    #> [3,] "01"    "01"  NA   NA  
    #> [4,] "010"   "010" "01" NA
    f2(x)
    #>      [,1]    [,2]  [,3] [,4]
    #> [1,] NA      NA    NA   NA  
    #> [2,] "01001" NA    NA   NA  
    #> [3,] "01"    "01"  NA   NA  
    #> [4,] "010"   "010" "01" NA
    

    Time the four solutions with a character vector of length 10k, with the string lengths varying from 10 to 64.

    x <- vapply(1:1e4, \(.) intToUtf8(sample(48:49, sample(10:64, 1), 1)), "")
    system.time(m1 <- buildmat(x, flcsPar(x), FALSE, NA))
    #>    user  system elapsed 
    #>   66.61   60.50  363.88
    stopCluster(cl)
    system.time(m2 <- buildmat(x, flcs(x), FALSE, NA))
    #>    user  system elapsed 
    #>    3.47    1.25    4.69
    identical(m1, m2)
    #> [1] TRUE
    system.time(m2 <- getPsub(x))
    #>    user  system elapsed 
    #>  596.86   59.57  661.10
    identical(m1, m2)
    #> [1] TRUE
    system.time(m2 <- f2(x))
    #>    user  system elapsed 
    #>  378.62    5.30  383.78
    identical(m1, m2)
    #> [1] TRUE
    

    If only the pairwise lengths are needed, there is no need for buildmat:

    system.time(flcs(x))
    #>    user  system elapsed 
    #>    0.84    0.31    1.16
    

    flcs performed nearly 50 million comparisons in about 1 second.


    Original Answer

    A fast solution for large vectors of binary strings. The function flcs below is able to perform the desired operation on 10k binary character strings in about 13 seconds.

    The idea is to sequentially check the kth character of each string and put each string into a 0 bin or a 1 bin. The longest common substring between i and j terminates at or before k if i and j are in different bins.

    flcs returns the endpoint of the pairwise common substring, which can be used to fill the bottom triangle of the desired matrix.

    library(data.table)
    
    flcs <- function(x) {
      y <- lapply(x, \(x) utf8ToInt(x) == 49L)
      nx <- length(x)
      nx1 <- nx - 1L
      lens <- lengths(y)
      dt <- data.table(
        v = unlist(y),
        i = sequence(lens),
        j1 = rep.int(1:nx, lens)
      )[,j2 := (j1 - 1L)*nx]
      # `n` is a vector containing the earliest mismatch between pairs
      n <- pmin(lens[rep.int(1:nx1, nx1:1)], lens[sequence(nx1:1, 2:nx)])
      b <- logical(length(n))
      # create a vector `idx` to map to the indices of `n`
      idx <- integer(nx^2)
      idx[sequence(nx1:1, seq(2, nx^2, nx + 1))] <-
        idx[sequence(nx1:1, seq(nx + 1, nx^2, nx + 1), nx)] <- 1:length(n)
      
      for (k in 1:-sort.int(-lens, 2)[2]) {
        nstop <- idx[dt[i == k, outer(j2[v], j1[!v], "+")]]
        nstop <- nstop[!b[nstop]]
        b[nstop] <- TRUE
        n[nstop] <- k - 1L
        if (all(b)) break
      }
      
      n
    }
    

    Demonstrating:

    x <- c("0100100010101010", "0100110010101010","0111001000","010111")
    flcs(x)
    #> [1] 5 2 3 2 3 2
    

    A function to build the final matrix from the output of flcs:

    buildmat <- function(x, n, symmetric = FALSE) {
      nx <- length(x)
      nx1 <- nx - 1L
      z <- matrix("", nx, nx)
      if (symmetric) {
        z[sequence(nx1:1, seq(2, nx^2, nx + 1))] <-
          z[sequence(nx1:1, seq(nx + 1, nx^2, nx + 1), nx)] <-
          substr(x[rep.int(1:nx1, nx1:1)], 1, n)
      } else {
        z[sequence(nx1:1, seq(2, nx^2, nx + 1))] <-
          substr(x[rep.int(1:nx1, nx1:1)], 1, n)
      }
      
      z
    }
    
    buildmat(x, flcs(x), TRUE)
    #>      [,1]    [,2]    [,3] [,4] 
    #> [1,] ""      "01001" "01" "010"
    #> [2,] "01001" ""      "01" "010"
    #> [3,] "01"    "01"    ""   "01" 
    #> [4,] "010"   "010"   "01" ""
    

    Compare to a solution that compares each string to each other string in parallel.

    library(parallel)
    
    f <- function(x) {
      n <- min(length(x[[1]]), length(x[[2]]))
      out <- which.max(x[[1]][1:n] != x[[2]][1:n])
      if (out == 1L && x[[1]][1] == x[[2]][1]) n + 1L else out
    }
    
    cl <- makeCluster(detectCores() - 1) # 15 cores
    clusterExport(cl, c("f"), environment(f))
    
    flcsPar <- function(x) {
      unlist(parLapply(cl, combn(lapply(x, utf8ToInt), 2, NULL, FALSE), f)) - 1L
    }
    
    buildmat(x, flcsPar(x), TRUE)
    #>      [,1]    [,2]    [,3] [,4] 
    #> [1,] ""      "01001" "01" "010"
    #> [2,] "01001" ""      "01" "010"
    #> [3,] "01"    "01"    ""   "01" 
    #> [4,] "010"   "010"   "01" ""
    

    Time the two functions on a vector of 10k binary strings.

    x <- vapply(1:1e4, \(.) intToUtf8(sample(48:49, sample(10:20, 1), 1)), "")
    system.time(n1 <- flcs(x))
    #>    user  system elapsed 
    #>    9.84    3.41   13.26
    system.time(n2 <- flcsPar(x))
    #>    user  system elapsed 
    #>   69.31   54.53  242.07
    identical(n1, n2)
    #> [1] TRUE
    

    flcs performed nearly 50 million comparisons in about 13 seconds.