I am trying to learn the dedupe library and want to match names that are more than 80% similar.
I am sharing my code and the error below. Please help.
import dedupe
from Levenshtein import distance
# Reproduction from the question: intended to group records whose names are
# at least 80% similar. The run shown below stops at dedupe.Dedupe(fields)
# with "KeyError: Missing variable type".
def test():
# Sample data (replace with your actual library data)
data = [
{'name': 'Alice Smith', 'address': '123 Main St', 'phone': '555-1212'},
{'name': 'Alice SmIth', 'address': '123 Main Street', 'phone': '555-1213'},
{'name': 'Bob Johnson', 'address': '456 Elm St', 'phone': '555-3434'},
{'name': 'Charlie Brown', 'address': '789 Maple Ave', 'phone': '555-5656'},
]
# Define fields for comparison (adjust based on your data)
# Define data fields and comparison functions
# NOTE: this specification has no 'type' key. The traceback below shows dedupe
# rejecting it because every variable specification must include a type.
# The comparator is also referenced only by name, as a string, inside a
# 'comparators' list; the function object itself is never passed to dedupe.
fields = [
{'field': 'name', 'comparators': ['name_similarity']},
]
# Define similarity functions - customize based on your matching criteria
# Divides the edit distance by the length of the longer string and subtracts
# the result from 1.
def name_similarity(s1, s2):
# Implement your name comparison logic here (e.g., Levenshtein distance, etc.)
distance1 = distance(s1, s2)
similarity = 1 - (distance1 / max(len(s1), len(s2))) # Normalize distance to 0-1 similarity
return similarity
# Set thresholds for field-wise and overall similarity (adjust as needed)
# The constructor call below is the line that raises in the traceback, so
# nothing after it is reached in the run shown.
deduper = dedupe.Dedupe(fields)
deduper.threshold( threshold=0.8)
# Process the data for deduplication
deduped_data = deduper.dedupe(data)
# Print the deduplicated results
print("Deduplicated Data:")
for cluster in deduped_data:
print(cluster)
# Run the reproduction only when the file is executed as a script.
if __name__ == '__main__':
test()
.....
C:\PythonProject\pythonProject\venv\Graph_POC\Scripts\python.exe C:\PythonProject\pythonProject\matching.py Traceback (most recent call last): File "C:\PythonProject\pythonProject\venv\Graph_POC\Lib\site-packages\dedupe\datamodel.py", line 152, in typify_variables
variable_type = definition["type"]
~~~~~~~~~~^^^^^^^^ KeyError: 'type'
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File "C:\PythonProject\pythonProject\matching.py", line 45, in <module>
test() File "C:\PythonProject\pythonProject\matching.py", line 32, in test
deduper = dedupe.Dedupe(fields)
^^^^^^^^^^^^^^^^^^^^^ File "C:\PythonProject\pythonProject\venv\Graph_POC\Lib\site-packages\dedupe\api.py", line 1155, in __init__
self.data_model = datamodel.DataModel(variable_definition)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\PythonProject\pythonProject\venv\Graph_POC\Lib\site-packages\dedupe\datamodel.py", line 42, in __init__
self.primary_variables, all_variables = typify_variables(variable_definitions)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\PythonProject\pythonProject\venv\Graph_POC\Lib\site-packages\dedupe\datamodel.py", line 161, in typify_variables
raise KeyError( KeyError: "Missing variable type: variable specifications are dictionaries that must include a type definition, ex. {'field' : 'Phone', type: 'String'}"
Process finished with exit code 1
My answer is for dedupe>=2.0
and the code may not work on other library versions.
I would also suggest checking this manual; it has a lot of useful information: https://dedupeio.github.io/dedupe-examples/docs/csv_example.html
Back to your error: when initializing the `dedupe.Dedupe` class, you are expected to pass information about the field types (see the `fields` variable in the code below).
Your data should also be formatted a bit differently: each row needs an id. See the `data_d` variable in the code below.
One more thing you are missing is a list of labeled examples: several matching pairs and several distinct pairs (ideally at least 10 pairs in each group). They are stored in the `labeled_examples` variable in the code.
Lastly, if you plan to use custom comparators, you should set the field's `type` to `Custom` and pass your function as its `comparator`. Otherwise `dedupe` will use the standard comparator for the field type (String/Category/Float etc.).
Check this link for all available variable definitions:
https://docs.dedupe.io/en/latest/Variable-definition.html
import dedupe
from Levenshtein import distance
# Define similarity functions - customize based on your matching criteria
def name_similarity(s1, s2):
    """Return a similarity score for two strings based on edit distance.

    The Levenshtein distance between the strings is divided by the length
    of the longer one and subtracted from 1, so identical strings score 1.0
    and the score falls as more edits are needed.

    Args:
        s1: First string to compare.
        s2: Second string to compare.

    Returns:
        1.0 when both strings are empty; otherwise
        ``1 - distance(s1, s2) / max(len(s1), len(s2))``.
    """
    longest = max(len(s1), len(s2))
    if longest == 0:
        # Two empty strings are identical; this guard also avoids the
        # ZeroDivisionError the unguarded formula would raise.
        return 1.0
    return 1 - distance(s1, s2) / longest
def test():
    """Train dedupe on a few hand-labeled pairs and print cluster membership."""
    # Sample records (replace with your actual library data). Consecutive
    # records (0-1, 2-3, 4-5, 6-7) are near-duplicates of each other.
    records = [
        {'name': 'Alice Smith', 'address': '123 Main St', 'phone': '555-1212'},
        {'name': 'Alice SmIth', 'address': '123 Main Street', 'phone': '555-1213'},
        {'name': 'Bob Johnson', 'address': '456 Elm St', 'phone': '555-3434'},
        {'name': 'Bob Johnson', 'address': '457 Elm St', 'phone': '555-3434'},
        {'name': 'Charlie Brown', 'address': '789 Maple Ave', 'phone': '555-5656'},
        {'name': 'Charlie Brown', 'address': '789 Meple Ave', 'phone': '555-5656'},
        {'name': 'Karry Perry', 'address': '102 Meple Ave', 'phone': '555-3556'},
        {'name': 'Karri Perry', 'address': '102 Meple Ave', 'phone': '555-3556'},
    ]

    # Labeled training pairs: each record at index 0, 2, 4 is paired with its
    # neighbour as a match, and with the record two positions on as distinct.
    pair_starts = (0, 2, 4)
    training_pairs = {
        "match": [(records[i], records[i + 1]) for i in pair_starts],
        "distinct": [(records[i], records[i + 2]) for i in pair_starts],
    }

    # The data is passed to dedupe keyed by a record id; the list position
    # serves as the id here.
    records_by_id = dict(enumerate(records))

    # Variable definitions: a custom comparator for the name, the built-in
    # string comparison for the phone. An address variable could be added as
    # {'field': 'address', 'type': 'String'}.
    variables = [
        {'field': 'name', 'type': 'Custom', 'comparator': name_similarity},
        {'field': 'phone', 'type': 'String'},
    ]

    deduper = dedupe.Dedupe(variables)
    deduper.prepare_training(records_by_id)
    deduper.mark_pairs(training_pairs)

    # Required only because this dataset is tiny: this is the GridSearchCV.cv
    # parameter used for creating the KFold. Remove the line once there are
    # more than 10 examples in each class.
    deduper.classifier.cv = 2

    deduper.train()

    clusters = deduper.partition(records_by_id, threshold=0.5)
    print('# duplicate sets', len(clusters))

    # Map every record id to the cluster it landed in and its score.
    cluster_membership = {}
    for cluster_id, (member_ids, member_scores) in enumerate(clusters):
        cluster_membership.update(
            (
                record_id,
                {"Cluster ID": cluster_id, "confidence_score": score},
            )
            for record_id, score in zip(member_ids, member_scores)
        )
    print(cluster_membership)


if __name__ == '__main__':
    test()
output:
{0: {'Cluster ID': 0, 'confidence_score': 0.82329434},
1: {'Cluster ID': 0, 'confidence_score': 0.82329434},
2: {'Cluster ID': 1, 'confidence_score': 0.96056044},
3: {'Cluster ID': 1, 'confidence_score': 0.96056044},
4: {'Cluster ID': 2, 'confidence_score': 0.96056044},
5: {'Cluster ID': 2, 'confidence_score': 0.96056044},
6: {'Cluster ID': 3, 'confidence_score': 0.9537174},
7: {'Cluster ID': 3, 'confidence_score': 0.9537174}}