python list partial-matches

How to extract common elements over two lists of strings - python


I am trying to match two lists of strings containing names that are written differently, so the entries only match partially:

# Sample data. Despite the names, the braces make these sets, not lists,
# so iteration order is not guaranteed.
list1 = {'ADELA SARABIA', 'JUAN PEREZ', 'JOHN ADAMS', 'TOM HANKS'}
list2 = {'JOSE GARCIA', 'HANKS TOM', 'PEREZ LOPEZ JUAN', 'JOHN P. ADAMS'}

I want to keep the names that appear in both lists, even when they only match partially. Desired output:

# Expected result: the list1 spellings of the names that also have a
# counterpart in list2.
matches = {'JUAN PEREZ', 'JOHN ADAMS', 'TOM HANKS'}

I was using this code from another Stack Overflow question, but it doesn't work in my case:

# Collects into lst the list2 entries that contain the leading word(s) of a
# list1 entry; list1 entries with no hit are appended with a
# ' - not found' suffix.
lst = []
for i in list1:
    has_match = False
    for j in list2:
        # Substring test: does the first word of i occur anywhere inside j?
        if i.split()[0] in j:
            has_match = True
            print(i, j)
            if j not in lst:
                lst.append(j)
        # NOTE: len(i) is the character count of the string, not its number
        # of words.
        if len(i) > 1:
            # k is the first two words of i joined by a single space; the
            # test below succeeds only when that exact text occurs in j.
            k = ' '.join(i.split()[:2])
            if k in j:
                has_match = True
                # k starts with the first word of i, so whenever this branch
                # runs the same pair was already printed by the test above.
                print(i, j)
                if j not in lst:
                    lst.append(j)
    if not has_match:
        lst.append(i + ' - not found')

Solution

  • This works exactly as I expected:

    def calculate_similarity(string1, string2):
        """Return the word-overlap ratio of two strings.

        Each string is split on whitespace into a set of words. The score is
        the number of shared words divided by the size of the smaller word
        set, so it is 1.0 when every word of the shorter name occurs in the
        longer one, regardless of word order.

        Args:
            string1: First string to compare.
            string2: Second string to compare.

        Returns:
            A number between 0 and 1. Returns 0.0 when either string
            contains no words.
        """
        words1 = set(string1.split())
        words2 = set(string2.split())
        smaller = min(len(words1), len(words2))
        if smaller == 0:
            # Empty or whitespace-only input: avoid a ZeroDivisionError.
            return 0.0
        return len(words1 & words2) / smaller
    
    # For every name in list1, find its most similar name in list2 and keep
    # it when the similarity clears the threshold.
    matches = set()

    for item1 in list1:
        best_similarity = 0
        best_match = None

        # This loop must be nested inside the outer one; otherwise only the
        # last item1 would ever be compared against list2.
        for item2 in list2:
            similarity = calculate_similarity(item1, item2)
            if similarity > best_similarity:
                best_similarity = similarity
                best_match = item2

        if best_similarity > 0.7:  # Adjust the threshold as needed
            # best_match is the list2 spelling of the name; add item1 here
            # instead if the list1 spelling is wanted.
            matches.add(best_match)

    print("Matches:", matches)