I am trying to match two lists of strings with names that are written differently and have partial matches:
# Sample inputs. Despite the names, both are set literals (curly braces),
# so they hold unique strings and have no guaranteed iteration order.
list1 = {'ADELA SARABIA', 'JUAN PEREZ', 'JOHN ADAMS', 'TOM HANKS'}
list2 = {'JOSE GARCIA', 'HANKS TOM', 'PEREZ LOPEZ JUAN', 'JOHN P. ADAMS'}
I want to keep the names that appear in both lists even though they only partially match. Desired output:
# Expected result: the list1 spellings of the names that also occur,
# in some form, in list2.
matches = {'JUAN PEREZ', 'JOHN ADAMS', 'TOM HANKS'}
I was using this code from another Stack Overflow question, but it doesn't work for my case:
# Substring-based matching: a list2 entry counts as a hit for a list1 name
# when it contains the name's first word, or its first two words joined by
# a single space. Hits are collected once each in `lst`; names without any
# hit are recorded with a ' - not found' suffix.
lst = []
for name in list1:
    found = False
    for candidate in list2:
        words = name.split()
        # Probe 1 is always the first word of the name.
        probes = [words[0]]
        # NOTE(review): len(name) counts characters, not words, so this
        # is true for any name longer than one character -- confirm intent.
        if len(name) > 1:
            # Probe 2 is the first two words in their original order.
            probes.append(' '.join(words[:2]))
        for probe in probes:
            if probe not in candidate:
                continue
            found = True
            # Printed once per probe that hits, so a pair can appear twice.
            print(name, candidate)
            if candidate not in lst:
                lst.append(candidate)
    if not found:
        lst.append(name + ' - not found')
This works exactly as I expected:
def calculate_similarity(string1: str, string2: str) -> float:
    """Return the word-overlap ratio between two whitespace-separated strings.

    Each string is split on whitespace into a set of unique words. The score
    is the number of shared words divided by the size of the smaller word
    set, so a name whose words all appear in the other string scores 1.0
    regardless of word order or extra words in the longer string.

    Args:
        string1: First string to compare.
        string2: Second string to compare.

    Returns:
        A value between 0.0 and 1.0. Returns 0.0 when either string
        contains no words (empty or whitespace-only).
    """
    words1 = set(string1.split())
    words2 = set(string2.split())
    # An empty word set would make the denominator zero; treat it as
    # "nothing in common" instead of raising ZeroDivisionError.
    if not words1 or not words2:
        return 0.0
    common_words = words1 & words2
    return len(common_words) / min(len(words1), len(words2))
# For every name in list1, pick the list2 entry with the highest word-overlap
# score and keep it when the score clears the threshold. Note that the value
# stored is the list2 spelling of the name, not the list1 one.
matches = set()
for name in list1:
    scored = [
        (calculate_similarity(name, candidate), candidate)
        for candidate in list2
    ]
    # max() returns the first of equally-scored entries; the default covers
    # an empty list2, where there is nothing to compare against.
    top_score, top_candidate = max(
        scored, key=lambda pair: pair[0], default=(0, None)
    )
    if top_score > 0.7:  # Adjust the threshold as needed
        matches.add(top_candidate)
print("Matches:", matches)