pythonregexpandasstring-matchingmatch-phrase

Need help matching phrases across multiple columns of a DataFrame in Python


I need help matching phrases in the data given below, where each phrase in TextA should be matched against the corresponding phrase in TextB.

The following code did not help me do this. How can I address it? I have hundreds of them to match.

#sorting jumbled phrases

def sorts(string_value):
    """Return the words of ``string_value`` in sorted order, space-joined.

    The input is split on runs of whitespace, so leading, trailing and
    repeated whitespace is dropped. Sorting is the default string order
    and is therefore case-sensitive.
    """
    words = string_value.split()
    words.sort()
    return ' '.join(words)

#Removing punctuations in string

# Characters stripped by punt(). The backslash is written as "\\" because
# "\," is not a valid escape sequence and newer Python versions warn about it.
punc = '''!()-[]{};:'"\\,<>./?@#$%^&*_~'''

# Deletion table built once so punt() removes every character in a single pass.
_PUNC_TABLE = str.maketrans('', '', punc)


def punt(test_str):
    """Return ``test_str`` with every character listed in ``punc`` removed.

    Characters that are not in ``punc`` (for example ``+``, ``=`` or
    whitespace) are left untouched.
    """
    return test_str.translate(_PUNC_TABLE)

#matching strings

def lets_match(x):
    """Return True if any TextA value in row ``x`` matches any TextB value.

    Two values match when they are equal after case-folding, punctuation
    removal (``punt``) and word sorting (``sorts``).

    NOTE(review): ``TextA`` and ``TextB`` are defined outside this block;
    they are presumably iterables of column labels -- confirm.

    The previous version case-folded the *label* used to index the row
    (``x[text1.casefold()]``) rather than the value found there. With
    mixed-case column labels that lookup fails, and a bare ``except``
    swallowed the error, so every row came back False.
    """
    for text1 in TextA:
        for text2 in TextB:
            try:
                left, right = x[text1], x[text2]
            except (KeyError, IndexError):
                # Label not present in this row: nothing to compare.
                continue
            if not (isinstance(left, str) and isinstance(right, str)):
                # Skip missing / non-text cells (e.g. NaN) instead of failing.
                continue
            if sorts(punt(left.casefold())) == sorts(punt(right.casefold())):
                return True
    return False
df['result'] = df.apply(lets_match, axis=1)

Even after implementing string sorting, punctuation removal and case-insensitive comparison, I am still getting those strings as not matching. Am I missing something here? Can someone help me achieve this?


Solution

  • Is there any issue with using a fuzzy-matching library? The implementation is pretty straightforward and works well, given that the data above is relatively similar. I've performed the steps below without any preprocessing.

        import pandas as pd
        """ Install the libs below via terminal:
    
                $pip install fuzzywuzzy
                $pip install python-Levenshtein
        """
    
        from fuzzywuzzy import fuzz
        from fuzzywuzzy import process
    
    
        #creating the data frames
            text_a = ['AKIL KUMAR SINGH','OUSMANI DJIBO','PETER HRYB','CNOC LIMITED','POLY NOVA INDUSTRIES LTD','SAM GAWED JR','ADAN GENERAL LLC','CHINA MOBLE LIMITED','CASTAR CO., LTD.','MURAN','OLD SAROOP FOR CAR SEAT COVERS','CNP HEALTHCARE, LLC','GLORY PACK LTD','AUNCO VENTURES','INTERNATIONAL COMPANY','SAMEERA HEAT AND ENERGY FUND']
            text_b = ['Singh, Akil Kumar','DJIBO, Ousmani Illiassou','HRYB, Peter','CNOOC LIMITED','POLYNOVA INDUSTRIES LTD.','GAWED, SAM','ADAN GENERAL TRADING FZE','CHINA MOBILE LIMITED','CASTAR GROUP CO., LTD.','MURMAN','Old Saroop for Car Seat Covers','CNP HEATHCARE, LLC','GLORY PACK LTD.','AUNCO VENTURE','INTL COMPANY','SAMEERA HEAT AND ENERGY PROPERTY FUND']
            df_text_a = pd.DataFrame(text_a, columns=['text_a'])
            df_text_b = pd.DataFrame(text_b, columns=['text_b'])

            def lets_match(txt: str, chklist: list) -> tuple:
                """Return the best fuzzy match for ``txt`` among ``chklist``.

                Delegates to ``process.extractOne`` with the
                ``fuzz.token_set_ratio`` scorer. For a list of choices the
                result is a ``(matched_choice, score)`` pair, which is why
                the callers expand it into two columns.
                NOTE(review): extractOne can return None when no candidate
                qualifies (e.g. an empty ``chklist``) -- confirm against the
                fuzzywuzzy docs if that case matters.
                """
                return process.extractOne(txt, chklist, scorer=fuzz.token_set_ratio)


            # match Text_A against Text_B
            # Pass the cell value, not str(row): str() of a row Series is its
            # printed form (column label, "Name: <index>", "dtype: object"), and
            # those extra tokens distort the similarity scores. Scores obtained
            # this way differ from runs that stringified the whole row.
            result_txt_ab = df_text_a.apply(lambda x: lets_match(x['text_a'], text_b), axis=1, result_type='expand')
            result_txt_ab.rename(columns={0:'Return Match', 1:'Match Value'}, inplace=True)
            df_text_a[result_txt_ab.columns]=result_txt_ab
            df_text_a
    

        text_a  Return Match    Match Value
        0   AKIL KUMAR SINGH    Singh, Akil Kumar   100
        1   OUSMANI DJIBO   DJIBO, Ousmani Illiassou    72
        2   PETER HRYB  HRYB, Peter 100
        3   CNOC LIMITED    CNOOC LIMITED   70
        4   POLY NOVA INDUSTRIES LTD    POLYNOVA INDUSTRIES LTD.    76
        5   SAM GAWED JR    GAWED, SAM  100
        6   ADAN GENERAL LLC    ADAN GENERAL TRADING FZE    67
        7   CHINA MOBLE LIMITED CHINA MOBILE LIMITED    79
        8   CASTAR CO., LTD.    CASTAR GROUP CO., LTD.  81
        9   MURAN   SAMEERA HEAT AND ENERGY PROPERTY FUND   41
        10  OLD SAROOP FOR CAR SEAT COVERS  Old Saroop for Car Seat Covers  100
        11  CNP HEALTHCARE, LLC CNP HEATHCARE, LLC  58
        12  GLORY PACK LTD  GLORY PACK LTD. 100
        13  AUNCO VENTURES  AUNCO VENTURE   56
        14  INTERNATIONAL COMPANY   INTL COMPANY    74
        15  SAMEERA HEAT AND ENERGY FUND    SAMEERA HEAT AND ENERGY PROPERTY FUND   86
    

        #match Text_B against Text_A
        # Pass the cell value, not str(row): str() of a row Series is its
        # printed form (column label, "Name: <index>", "dtype: object"), and
        # those extra tokens distort the similarity scores. Scores obtained
        # this way differ from runs that stringified the whole row.
        result_txt_ba = df_text_b.apply(lambda x: lets_match(x['text_b'], text_a), axis=1, result_type='expand')
        # Columns 0 and 1 hold the two elements of each (match, score) result.
        result_txt_ba.rename(columns={0:'Return Match', 1:'Match Value'}, inplace=True)
        df_text_b[result_txt_ba.columns]=result_txt_ba
        df_text_b
    

    text_b  Return Match    Match Value
    0   Singh, Akil Kumar   AKIL KUMAR SINGH    100
    1   DJIBO, Ousmani Illiassou    OUSMANI DJIBO   100
    2   HRYB, Peter PETER HRYB  100
    3   CNOOC LIMITED   CNOC LIMITED    74
    4   POLYNOVA INDUSTRIES LTD.    POLY NOVA INDUSTRIES LTD    74
    5   GAWED, SAM  SAM GAWED JR    86
    6   ADAN GENERAL TRADING FZE    ADAN GENERAL LLC    86
    7   CHINA MOBILE LIMITED    CHINA MOBLE LIMITED 81
    8   CASTAR GROUP CO., LTD.  CASTAR CO., LTD.    100
    9   MURMAN  ADAN GENERAL LLC    33
    10  Old Saroop for Car Seat Covers  OLD SAROOP FOR CAR SEAT COVERS  100
    11  CNP HEATHCARE, LLC  CNP HEALTHCARE, LLC 56
    12  GLORY PACK LTD. GLORY PACK LTD  100
    13  AUNCO VENTURE   AUNCO VENTURES  53
    14  INTL COMPANY    INTERNATIONAL COMPANY   50
    15  SAMEERA HEAT AND ENERGY PROPERTY FUND   SAMEERA HEAT AND ENERGY FUND    100