Tags: python, pandas, dataframe, spelling, multiple-resultsets

Multiple Spelling Results in a Dataframe


I have some data containing spelling errors. I'm correcting them and scoring how close the spelling is using the following code:

import pandas as pd
import difflib

# Reference vocabulary: the correctly spelled words to match against.
Li_A = ["potato", "tomato", "squash", "apple", "pear"]

# Two columns of misspelled words that share the row labels 'a'..'e'.
B = {
    'one': pd.Series(
        ["potat0", "toma3o", "s5uash", "ap8le", "pea7"],
        index=list("abcde"),
    ),
    'two': pd.Series(
        ["po1ato", "2omato", "squ0sh", "2pple", "p3ar"],
        index=list("abcde"),
    ),
}

df_B = pd.DataFrame(B)

# Define the function that corrects the spelling:
def Spelling(ask):
    """Return the word in ``Li_A`` that difflib ranks closest to ``ask``.

    Up to three candidates with a similarity of at least 0.5 are requested
    and the first (best-scoring) one is returned.

    Raises:
        IndexError: if no word in ``Li_A`` reaches the cutoff.
    """
    candidates = difflib.get_close_matches(ask, Li_A, n=3, cutoff=0.5)
    return candidates[0]

# Run Spelling on every entry of column 'one' and store the results in a new 'Correct one' column.
df_B['Correct one'] = df_B['one'].apply(Spelling)

# Define the function that scores the spelling:
def Spell_Score(row):
    """Return the similarity ratio between ``row['one']`` and ``row['Correct one']``.

    ``row`` is any mapping-like object exposing those two keys; the ratio is
    computed by ``difflib.SequenceMatcher`` with no junk filter.
    """
    misspelt = row['one']
    corrected = row['Correct one']
    matcher = difflib.SequenceMatcher(None, misspelt, corrected)
    return matcher.ratio()

# axis=1 hands each row to Spell_Score; the returned values become the new 'Score' column.
df_B['Score'] = df_B.apply(Spell_Score, axis=1)

This outputs the correct spelling and the score:

df_B
       one     two Correct one     Score
 a  potat0  po1ato      potato  0.833333
 b  toma3o  2omato      tomato  0.833333
 c  s5uash  squ0sh      squash  0.833333
 d   ap8le   2pple       apple  0.800000
 e    pea7    p3ar        pear  0.750000

How can I add columns to give the second and third highest scoring results and their scores please?


Solution

  • It is simpler to use a single function that returns all the values converted to a Series, and then join the result to the original DataFrame:

    def Spelling(ask, n=3, cutoff=0.5):
        """Return the closest matches for ``ask`` in ``Li_A`` followed by their scores.

        The result is a positional ``pd.Series`` of fixed length ``2 * n``:
        the first ``n`` entries are the matches (best first) and the last
        ``n`` entries are the corresponding ``SequenceMatcher`` ratios.
        When fewer than ``n`` words reach ``cutoff`` the gaps are filled
        with NaN, so that every row built through ``Series.apply`` keeps
        matches and scores in the same columns instead of letting scores
        slide into the match columns.

        Args:
            ask: the (possibly misspelled) word to look up.
            n: maximum number of matches to return; must be positive.
            cutoff: minimum similarity, in [0, 1], for a word to count.
        """
        matches = difflib.get_close_matches(ask, Li_A, n=n, cutoff=cutoff)
        # One similarity ratio per match, in the same order as the matches.
        scores = [difflib.SequenceMatcher(None, ask, word).ratio() for word in matches]
        # Pad both halves to exactly n entries so their positions are stable.
        padding = [float('nan')] * (n - len(matches))
        return pd.Series(matches + padding + scores + padding)
    
    # Each Series returned by Spelling becomes one row of the new frame.
    df = df_B['one'].apply(Spelling)

    # Split the column count in half: the first half is labelled as
    # spellings, the second half as scores.
    half = len(df.columns) // 2
    spelling_cols = [f'Spelling_{i}' for i in range(half)]
    score_cols = [f'Spell_Score{i}' for i in range(half)]
    df.columns = spelling_cols + score_cols

    # Attach the new columns to the original data by index and show the result.
    df1 = df_B.join(df)
    print(df1)
          one     two Spelling_0 Spelling_1  Spell_Score0  Spell_Score1
    a  potat0  po1ato     potato     tomato      0.833333           0.5
    b  toma3o  2omato     tomato   0.833333           NaN           NaN
    c  s5uash  squ0sh     squash   0.833333           NaN           NaN
    d   ap8le   2pple      apple        0.8           NaN           NaN
    e    pea7    p3ar       pear       0.75           NaN           NaN