pythonalgorithmdamerau-levenshtein

Modify Damerau-Levenshtein algorithm to track transformations (insertions, deletions, etc)


I'm wondering how to modify the Damerau-Levenshtein algorithm to track the specific character transformations required to change a source string to a target string. This question has been answered for the Levenshtein distance, but I couldn't find any answers for DL distance.

I looked at the py-Levenshtein module: it provides exactly what I need, but for Levenshtein distance:

Levenshtein.editops("FBBDE", "BCDASD")
[('delete', 0, 0), ('replace', 2, 1), ('insert', 4, 3), ('insert', 4, 
4), ('replace', 4, 5)]

The code for editops was difficult to decipher since it's written in C. I wonder how tracking transformations the can be done efficiently: I imagine it is possible from the distance matrix, which looks something like:

      r  e  p  u  b  l  i  c  a  n
   0  1  2  3  4  5  6  7  8  9  10
d  1  1  2  3  4  5  6  7  8  9  10
e  2  2  1  2  3  4  5  6  7  8  9
m  3  3  2  2  3  4  5  6  7  8  9
o  4  4  3  3  3  4  5  6  7  8  9
c  5  5  4  4  4  4  5  6  6  7  8
r  6  5  5  5  5  5  5  6  7  7  8
a  7  6  6  6  6  6  6  6  7  7  8
t  8  7  7  7  7  7  7  7  7  8  8

Solution

  • import numpy as np
    
    def levenshtein_distance(string1, string2):
        n1 = len(string1)
        n2 = len(string2)
        return _levenshtein_distance_matrix(string1, string2)[n1, n2]
    
    def damerau_levenshtein_distance(string1, string2):
        n1 = len(string1)
        n2 = len(string2)
        return _levenshtein_distance_matrix(string1, string2, True)[n1, n2]
    
    def get_ops(string1, string2, is_damerau=False):
        i, j = _levenshtein_distance_matrix(string1, string2, is_damerau).shape
        i -= 1
        j -= 1
        ops = list()
        while i != -1 and j != -1:
            if is_damerau:
                if i > 1 and j > 1 and string1[i-1] == string2[j-2] and string1[i-2] == string2[j-1]:
                    if dist_matrix[i-2, j-2] < dist_matrix[i, j]:
                        ops.insert(0, ('transpose', i - 1, i - 2))
                        i -= 2
                        j -= 2
                        continue
            index = np.argmin([dist_matrix[i-1, j-1], dist_matrix[i, j-1], dist_matrix[i-1, j]])
            if index == 0:
                if dist_matrix[i, j] > dist_matrix[i-1, j-1]:
                    ops.insert(0, ('replace', i - 1, j - 1))
                i -= 1
                j -= 1
            elif index == 1:
                ops.insert(0, ('insert', i - 1, j - 1))
                j -= 1
            elif index == 2:
                ops.insert(0, ('delete', i - 1, i - 1))
                i -= 1
        return ops
    
    def execute_ops(ops, string1, string2):
        strings = [string1]
        string = list(string1)
        shift = 0
        for op in ops:
            i, j = op[1], op[2]
            if op[0] == 'delete':
                del string[i + shift]
                shift -= 1
            elif op[0] == 'insert':
                string.insert(i + shift + 1, string2[j])
                shift += 1
            elif op[0] == 'replace':
                string[i + shift] = string2[j]
            elif op[0] == 'transpose':
                string[i + shift], string[j + shift] = string[j + shift], string[i + shift]
            strings.append(''.join(string))
        return strings
    
    def _levenshtein_distance_matrix(string1, string2, is_damerau=False):
        n1 = len(string1)
        n2 = len(string2)
        d = np.zeros((n1 + 1, n2 + 1), dtype=int)
        for i in range(n1 + 1):
            d[i, 0] = i
        for j in range(n2 + 1):
            d[0, j] = j
        for i in range(n1):
            for j in range(n2):
                if string1[i] == string2[j]:
                    cost = 0
                else:
                    cost = 1
                d[i+1, j+1] = min(d[i, j+1] + 1, # insert
                                  d[i+1, j] + 1, # delete
                                  d[i, j] + cost) # replace
                if is_damerau:
                    if i > 0 and j > 0 and string1[i] == string2[j-1] and string1[i-1] == string2[j]:
                        d[i+1, j+1] = min(d[i+1, j+1], d[i-1, j-1] + cost) # transpose
        return d
    
    if __name__ == "__main__":
        # GIFTS PROFIT
        # FBBDE BCDASD
        # SPARTAN PART
        # PLASMA ALTRUISM
        # REPUBLICAN DEMOCRAT
        # PLASMA PLASMA
        # FISH IFSH
        # STAES STATES
        string1 = 'FISH'
        string2 = 'IFSH'
        for is_damerau in [True, False]:
            if is_damerau:
                print('=== damerau_levenshtein_distance ===')
            else:
                print('=== levenshtein_distance ===')
            dist_matrix = _levenshtein_distance_matrix(string1, string2, is_damerau=is_damerau)
            print(dist_matrix)
            ops = get_ops(string1, string2, is_damerau=is_damerau)
            print(ops)
            res = execute_ops(ops, string1, string2)
            print(res)
    

    Output:

    === damerau_levenshtein_distance ===
    [[0 1 2 3 4]
     [1 1 1 2 3]
     [2 1 1 2 3]
     [3 2 2 1 2]
     [4 3 3 2 1]]
    [('transpose', 1, 0)]
    ['FISH', 'IFSH']
    === levenshtein_distance ===
    [[0 1 2 3 4]
     [1 1 1 2 3]
     [2 1 2 2 3]
     [3 2 2 2 3]
     [4 3 3 3 2]]
    [('replace', 0, 0), ('replace', 1, 1)]
    ['FISH', 'IISH', 'IFSH']