I'm using the following code to detect that two songs (same song, but different versions, e.g. typos, or concert variations) on Spotify
are the same (have similar name). Is there a better (more intelligent) approach than checking if they have common beginning or levenshtein distance
?
When I run that code, I have still duplicates, once in a while and have to delete them manually.
def similarity_old(s1, s2):
if len(s1) != len(s2):
return False
count = 0
for c1, c2 in zip(s1, s2):
if c1 == c2:
count += 1
similarity_percentage = (count / len(s1)) * 100
return similarity_percentage > 70
def similarity_old2(s1, s2):
# implements levenshtein distance
m = len(s1)
n = len(s2)
if abs(m - n) > max(m, n) * 0.9: # Allowing up to 90% length difference
return False
if s1 == s2:
return True
if m == 0 or n == 0:
return False
# Create a matrix to store the edit distances
dp = [[0] * (n + 1) for _ in range(m + 1)]
# Initialize the first row and column of the matrix
for i in range(m + 1):
dp[i][0] = i
for j in range(n + 1):
dp[0][j] = j
# Compute the edit distances
for i in range(1, m + 1):
for j in range(1, n + 1):
cost = 0 if s1[i - 1] == s2[j - 1] else 1
dp[i][j] = min(dp[i - 1][j] + 1, # Deletion
dp[i][j - 1] + 1, # Insertion
dp[i - 1][j - 1] + cost) # Substitution
# Calculate the similarity percentage
similarity_percentage = ((max(m, n) - dp[m][n]) / max(m, n)) * 100
return similarity_percentage > 70
def similarity(s1, s2):
# optimized levenshtein distance
len1 = len(s1)
len2 = len(s2)
if abs(len1 - len2) > max(len1, len2) * 0.9: # Allowing up to 90% length difference
return False
if s1 == s2:
return True
if len1 == 0 or len2 == 0:
return False
if len1 > len2:
s1, s2 = s2, s1
len1, len2 = len2, len1
previous_row = list(range(len1 + 1))
for i, c2 in enumerate(s2):
current_row = [i + 1]
for j, c1 in enumerate(s1):
insertions = previous_row[j + 1] + 1
deletions = current_row[j] + 1
substitutions = previous_row[j] + (c1 != c2)
current_row.append(min(insertions, deletions, substitutions))
previous_row = current_row
similarity_percentage = ((len1 - previous_row[-1]) / len1) * 100
return similarity_percentage > 70
def common_beginning(s1, s2):
if len(s1) > 11 and s2.startswith(s1[:11]):
return True
else:
return False
def check_similarity(s, list1):
# s = song name
# list1 = list of song names
# returns True if s is similar to something in list1, else False
for item in list1:
if common_beginning(s, item):
return True
#for item in list1:
# if similarity(s, item):
# return True
return False
I would tokenize the name, make all lowercase, filter out some words like "live, feat. ..." and then compute the difference so:
IRRELEVANT_TOKENS = ["live", "feat.", "remix", "remastered"]
a1 = text1.lower()
b1 = text2.lower()
## redundant strings
a2 = a1
b2 = b1
for rm in IRRELEVANT_TOKENS:
a2 = a2.replace(rm, "")
b2 = b2.replace(rm, "")
a3 = a2.split(" ")
b3 = b2.split(" ")
if (len(set(a3) - set(b3)) == 0 or
len(set(b3) - set(a3)) == 0):
print("One contains the other, likely to be the same song")
else:
print("Largely different, not likely to be the same song")
Code not tested, how do you make it take control of your Spotify?
PS: difflib might be a simple solution too. But all solutions will either have false positives, false negatives or both, this is very difficult to automate!