pythonalgorithmlevenshtein-distance

How to detect that two songs on Spotify are the same (have similar name)


I'm using the following code to detect that two songs (same song, but different versions, e.g. typos, or concert variations) on Spotify are the same (have similar name). Is there a better (more intelligent) approach than checking if they have common beginning or levenshtein distance? When I run that code, I have still duplicates, once in a while and have to delete them manually.

def similarity_old(s1, s2):
    if len(s1) != len(s2):
        return False

    count = 0
    for c1, c2 in zip(s1, s2):
        if c1 == c2:
            count += 1

    similarity_percentage = (count / len(s1)) * 100
    return similarity_percentage > 70


def similarity_old2(s1, s2):
    # implements levenshtein distance
    m = len(s1)
    n = len(s2)
    if abs(m - n) > max(m, n) * 0.9:  # Allowing up to 90% length difference
        return False

    if s1 == s2:
        return True

    if m == 0 or n == 0:
        return False

    # Create a matrix to store the edit distances
    dp = [[0] * (n + 1) for _ in range(m + 1)]

    # Initialize the first row and column of the matrix
    for i in range(m + 1):
        dp[i][0] = i
    for j in range(n + 1):
        dp[0][j] = j

    # Compute the edit distances
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if s1[i - 1] == s2[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,  # Deletion
                           dp[i][j - 1] + 1,  # Insertion
                           dp[i - 1][j - 1] + cost)  # Substitution

    # Calculate the similarity percentage
    similarity_percentage = ((max(m, n) - dp[m][n]) / max(m, n)) * 100

    return similarity_percentage > 70

def similarity(s1, s2):
    # optimized levenshtein distance
    len1 = len(s1)
    len2 = len(s2)
    if abs(len1 - len2) > max(len1, len2) * 0.9:  # Allowing up to 90% length difference
        return False

    if s1 == s2:
        return True

    if len1 == 0 or len2 == 0:
        return False

    if len1 > len2:
        s1, s2 = s2, s1
        len1, len2 = len2, len1

    previous_row = list(range(len1 + 1))

    for i, c2 in enumerate(s2):
        current_row = [i + 1]
        for j, c1 in enumerate(s1):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    similarity_percentage = ((len1 - previous_row[-1]) / len1) * 100

    return similarity_percentage > 70


def common_beginning(s1, s2):
    if len(s1) > 11 and s2.startswith(s1[:11]):
        return True
    else:
        return False


def check_similarity(s, list1):
    # s = song name
    # list1 = list of song names
    # returns True if s is similar to something in list1, else False

    for item in list1:
        if common_beginning(s, item):
            return True
    #for item in list1:
    #    if similarity(s, item):
    #        return True
    return False

Solution

  • I would tokenize the name, make all lowercase, filter out some words like "live, feat. ..." and then compute the difference so:

    IRRELEVANT_TOKENS = ["live", "feat.", "remix", "remastered"]
    
    a1 = text1.lower()
    b1 = text2.lower()
    
    ## redundant strings
    a2 = a1
    b2 = b1
    for rm in IRRELEVANT_TOKENS:
        a2 = a2.replace(rm, "")
        b2 = b2.replace(rm, "")
    
    a3 = a2.split(" ")
    b3 = b2.split(" ")
    
    
    if (len(set(a3) - set(b3)) == 0 or
        len(set(b3) - set(a3)) == 0):
        print("One contains the other, likely to be the same song")
    else:
        print("Largely different, not likely to be the same song")
    
    

    Code not tested, how do you make it take control of your Spotify?

    PS: difflib might be a simple solution too. But all solutions will either have false positives, false negatives or both, this is very difficult to automate!