I am trying to implement Minimum Edit Distance with the substitution cost of 2. Following is the code I've so far. It works well for strings of equal length but generates error for the unequal strings. Kindly correct me where i am wrong
def med(source, target):
# if len(x) > len(y):
# print("insode if")
# source, target = y, x
print(len(source), len(target))
cost = [[0 for inner in range(len(source)+1)] for outer in
range(len(target)+1)]
global backtrace
backtrace = [[0 for inner in range(len(source)+1)] for outer in
range(len(target)+1)]
global SUB
global INS
global DEL
for i in range(0,len(target)+1):
cost[i][0] = i
for j in range(0,len(source)+1):
cost[0][j] = j
for i in range(1,len(target)+1):
for j in range(1,len(source)+1):
if source[i-1]==target[j-1]:
cost[i][j] = cost[i-1][j-1]
else:
deletion = cost[i-1][j]+1
insertion = cost[i][j-1]+1
substitution = cost[i-1][j-1]+2
cost[i][j] = min(insertion,deletion,substitution)
if cost[i][j] == substitution:
backtrace[i][j] = SUB
elif cost[i][j] == insertion:
backtrace[i][j] = INS
else:
backtrace[i][j] = DEL
return cost[i][j]
med("levenshtein","levels")
The error i get is:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-26-86bf20ea27c7> in <module>()
49 return cost[i][j]
50
---> 51 med("levenshtein","levels")
<ipython-input-26-86bf20ea27c7> in med(source, target)
31 for i in range(1,len(target)+1):
32 for j in range(1,len(source)+1):
---> 33 if source[i-1]==target[j-1]:
34 cost[i][j] = cost[i-1][j-1]
35 else:
IndexError: string index out of range
For different length strings, cost
and backtrace
indices doesn't match.
Can be implemented minimum edit distance with 2 substitution cost by updating only one numpy m
* n
arr with cost at each step.
As per Algorithm, Below code will do the job.
def minimumEditDistance(first, second):
#Creating numpy ndarray( initialized with 0 of dimension of size of both strings
matrix = np.zeros((len(first)+1,len(second)+1), dtype=np.int)
# Cross relation loop through each character of each string with each other and
# fill the respective index of matrxi (row,column)
for i in range(len(first)+1):
for j in range(len(second)+1):
#First doing the boundary value analysis, if first or second string is empty so directly adding insertion cost
if i == 0:
matrix[i][j] = j
#Second case
elif j == 0:
matrix[i][j] = i
else:
matrix[i][j] = min(matrix[i][j-1] + 1,
matrix[i-1][j] + 1,
matrix[i-1][j-1] + 2 if first[i-1] != second[j-1] else matrix[i-1][j-1] + 0)
# Adjusted the cost accordinly, insertion = 1, deletion=1 and substitution=2
return matrix[len(first)][len(second)] # Returning the final
>>>print(minimumEditDistance('levenshtein','levels'))
7
>>>print(minimumEditDistance('levenshtein','levenshtein'))
0