python, python-3.x, python-dedupe

Python code failing: dedupe library error


I am trying to learn about the dedupe library. I am trying to match names that are more than 80% similar.

I am sharing the code and the error below. Please help.

import dedupe
from Levenshtein import distance

def test():
    """Try to deduplicate four in-memory sample records by their 'name' field.

    NOTE(review): as posted, this function does not run to completion. The
    traceback that follows this listing shows ``dedupe.Dedupe(fields)``
    raising ``KeyError`` ("Missing variable type") because the field
    definition below has no ``'type'`` key. Nothing after that call is reached.
    """


    # Sample data (replace with your actual library data)
    data = [
        {'name': 'Alice Smith', 'address': '123 Main St', 'phone': '555-1212'},
        {'name': 'Alice SmIth', 'address': '123 Main Street', 'phone': '555-1213'},
        {'name': 'Bob Johnson', 'address': '456 Elm St', 'phone': '555-3434'},
        {'name': 'Charlie Brown', 'address': '789 Maple Ave', 'phone': '555-5656'},
    ]

    # Define fields for comparison (adjust based on your data)
    # Define data fields and comparison functions
    # NOTE: 'comparators' holds the *string* 'name_similarity', not a reference
    # to the function defined further down, and the definition has no 'type' key.
    fields = [
        {'field': 'name', 'comparators': ['name_similarity']},

    ]

    # Define similarity functions - customize based on your matching criteria
    def name_similarity(s1, s2):
        """Return 1 - distance(s1, s2) / max(len(s1), len(s2)).

        ``distance`` is the function imported from the ``Levenshtein`` package.
        Raises ``ZeroDivisionError`` when both arguments have length 0.
        """
        # Implement your name comparison logic here (e.g., Levenshtein distance, etc.)
        distance1 = distance(s1, s2)
        similarity = 1 - (distance1 / max(len(s1), len(s2)))  # Normalize distance to 0-1 similarity
        return similarity



    # Set thresholds for field-wise and overall similarity (adjust as needed)
    # The traceback below points at the next line as the failing call.
    deduper = dedupe.Dedupe(fields)
    # NOTE(review): never reached in the posted run; confirm that `threshold`
    # and `dedupe` are valid calls for the installed dedupe version.
    deduper.threshold( threshold=0.8)

    # Process the data for deduplication
    deduped_data = deduper.dedupe(data)

    # Print the deduplicated results
    print("Deduplicated Data:")
    for cluster in deduped_data:
        print(cluster)


# Run the example only when this file is executed as a script.
if __name__ == '__main__':
    test()

.....

C:\PythonProject\pythonProject\venv\Graph_POC\Scripts\python.exe C:\PythonProject\pythonProject\matching.py
Traceback (most recent call last):
  File "C:\PythonProject\pythonProject\venv\Graph_POC\Lib\site-packages\dedupe\datamodel.py", line 152, in typify_variables
    variable_type = definition["type"]
                    ~~~~~~~~~~^^^^^^^^
KeyError: 'type'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\PythonProject\pythonProject\matching.py", line 45, in <module>
    test()
  File "C:\PythonProject\pythonProject\matching.py", line 32, in test
    deduper = dedupe.Dedupe(fields)
              ^^^^^^^^^^^^^^^^^^^^^
  File "C:\PythonProject\pythonProject\venv\Graph_POC\Lib\site-packages\dedupe\api.py", line 1155, in __init__
    self.data_model = datamodel.DataModel(variable_definition)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\PythonProject\pythonProject\venv\Graph_POC\Lib\site-packages\dedupe\datamodel.py", line 42, in __init__
    self.primary_variables, all_variables = typify_variables(variable_definitions)
                                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\PythonProject\pythonProject\venv\Graph_POC\Lib\site-packages\dedupe\datamodel.py", line 161, in typify_variables
    raise KeyError(
KeyError: "Missing variable type: variable specifications are dictionaries that must include a type definition, ex. {'field' : 'Phone', type: 'String'}"

Process finished with exit code 1

Solution

  • My answer is for dedupe>=2.0 and the code may not work on other library versions.

    I would also suggest checking this manual; it has a lot of useful information: https://dedupeio.github.io/dedupe-examples/docs/csv_example.html

    1. Back to your error: when initializing the dedupe.Dedupe class, you are expected to pass information about field types (see the fields variable in the code below).

    2. Your data should also be formatted a bit differently: each row should have an id. See the data_d variable in the code below.

    3. One more thing you are missing is a list of labeled examples: basically several matching pairs and several distinct pairs (better to have at least 10 pairs in each group). They are stored in the labeled_examples variable in the code.

    4. Lastly, if you plan to use custom comparators, you should set the field's 'type' to Custom. Otherwise dedupe will use the standard comparator based on the field type (String/Category/Float etc.).

    Check this link for all available variable definitions:

    https://docs.dedupe.io/en/latest/Variable-definition.html

    import dedupe
    from Levenshtein import distance
    
    # Define similarity functions - customize based on your matching criteria
    def name_similarity(s1, s2):
        """Return a normalized Levenshtein similarity between two strings.

        The edit distance between ``s1`` and ``s2`` is divided by the length
        of the longer string and subtracted from 1, so identical strings
        score 1.0 and the score drops as more edits are needed.

        Args:
            s1: First string to compare.
            s2: Second string to compare.

        Returns:
            The similarity score as a float; 1.0 for equal inputs.
        """
        # Equal inputs need no edit-distance computation. This also covers two
        # empty strings, for which max(len(s1), len(s2)) is 0 and the
        # normalization below would raise ZeroDivisionError.
        if s1 == s2:
            return 1.0
        edit_distance = distance(s1, s2)
        # Normalize by the longer string so scores are comparable across lengths.
        return 1 - (edit_distance / max(len(s1), len(s2)))
    
    def test():
        """Train dedupe on a small labeled sample and print each record's cluster.

        Builds eight sample records, labels a few pairs as matching or
        distinct, trains a ``dedupe.Dedupe`` model on them, partitions the
        records into clusters, and prints the number of clusters followed by a
        mapping of record id to cluster id and confidence score.
        """
        # Sample data (replace with your actual library data).
        data = [
            {'name': 'Alice Smith', 'address': '123 Main St', 'phone': '555-1212'},
            {'name': 'Alice SmIth', 'address': '123 Main Street', 'phone': '555-1213'},
            {'name': 'Bob Johnson', 'address': '456 Elm St', 'phone': '555-3434'},
            {'name': 'Bob Johnson', 'address': '457 Elm St', 'phone': '555-3434'},
            {'name': 'Charlie Brown', 'address': '789 Maple Ave', 'phone': '555-5656'},
            {'name': 'Charlie Brown', 'address': '789 Meple Ave', 'phone': '555-5656'},
            {'name': 'Karry Perry', 'address': '102 Meple Ave', 'phone': '555-3556'},
            {'name': 'Karri Perry', 'address': '102 Meple Ave', 'phone': '555-3556'},
        ]

        # dedupe is given the records keyed by an id; here the id is the list index.
        records_by_id = dict(enumerate(data))

        # Hand-labeled training pairs: rows (0,1), (2,3), (4,5) are the same
        # entity, while rows (0,2), (2,4), (4,6) are different entities.
        pair_starts = (0, 2, 4)
        training_pairs = {
            "match": [(data[i], data[i + 1]) for i in pair_starts],
            "distinct": [(data[i], data[i + 2]) for i in pair_starts],
        }

        # Fields to compare: 'name' uses the custom comparator, 'phone' the
        # built-in String type. 'address' is not compared; a String definition
        # for it could be added here.
        variable_definitions = [
            {'field': 'name', 'type': 'Custom', 'comparator': name_similarity},
            {'field': 'phone', 'type': 'String'},
        ]

        model = dedupe.Dedupe(variable_definitions)
        model.prepare_training(records_by_id)
        model.mark_pairs(training_pairs)

        # !! Required to run this example because the dataset is very small.
        # !! Remove it if your dataset has more than 10 examples in each class.
        # This is the GridSearchCV.cv parameter used for creating the KFold.
        model.classifier.cv = 2

        # Train the model on the labeled pairs.
        model.train()

        clusters = model.partition(records_by_id, threshold=0.5)

        print('# duplicate sets', len(clusters))

        # Flatten (records, scores) clusters into one lookup per record id.
        cluster_membership = {
            record_id: {"Cluster ID": cluster_id, "confidence_score": score}
            for cluster_id, (records, scores) in enumerate(clusters)
            for record_id, score in zip(records, scores)
        }

        print(cluster_membership)
    
    
    
    # Entry point: run the example only when this file is executed directly.
    if __name__ == "__main__":
        test()
    

    output:

    {0: {'Cluster ID': 0, 'confidence_score': 0.82329434}, 
     1: {'Cluster ID': 0, 'confidence_score': 0.82329434}, 
     2: {'Cluster ID': 1, 'confidence_score': 0.96056044}, 
     3: {'Cluster ID': 1, 'confidence_score': 0.96056044}, 
     4: {'Cluster ID': 2, 'confidence_score': 0.96056044}, 
     5: {'Cluster ID': 2, 'confidence_score': 0.96056044}, 
     6: {'Cluster ID': 3, 'confidence_score': 0.9537174}, 
     7: {'Cluster ID': 3, 'confidence_score': 0.9537174}}