python arrays list set fuzzy-comparison

How to add left-out (unmatched) strings to the list of matched strings


Below is my example code:

from fuzzywuzzy import fuzz
import json
from itertools import zip_longest

# Load the synonym table: each key is a category name (e.g. "processor") and
# each value is the list of phrases that should be labelled with that category.
# The context manager closes the file handle once the JSON has been parsed.
with open("synonyms.json", "r", encoding="utf-8") as synonyms_file:
    synonyms = json.load(synonyms_file)

# Each entry must be a single-line string literal: a literal that is
# hard-wrapped across lines is a syntax error.
vendor_data = [
    "i7 processor",
    "solid state",
    "Corei5 :1135G7 (11th Generation)",
    "hard drive",
    "ddr 8gb",
    "something1",
    "something2",
    "something3",
    "HT (100W) DDR4-2400",
]

buyer_data = ["i7 processor 12 generation", "corei7:latest technology"]
# Labelled entries collected for each side; an entry can be appended once per
# matching synonym, so duplicates are removed after the loop.
vendor = []
buyer = []
# item is the category name, value is that category's list of synonym phrases.
for item,value in synonyms.items():
    # zip_longest pads the shorter list with None, so k2 is None once
    # buyer_data has been exhausted.
    for k,k2 in zip_longest(vendor_data,buyer_data):
        for v in value:
            if fuzz.token_set_ratio(k,v) > 70:
                # Prefix the category name unless the entry already contains it.
                if item in k:
                    vendor.append(k)
                else:
                    vendor.append(item+" "+k)
            else:
                # NOTE(review): this branch is reached for every (entry, synonym)
                # pair that scores 70 or less, not only for entries that match no
                # synonym at all, so the "something" strings cannot be singled out
                # here. As posted, the branch also contains no statement.

            # Same labelling rule for the buyer entry paired with k.
            if fuzz.token_set_ratio(k2,v) > 70:
                if item in k2:
                    buyer.append(k2)
                else:
                    buyer.append(item+" "+k2)

# Deduplicate via set(); the original insertion order is not preserved.
vendor = list(set(vendor))
buyer = list(set(buyer))
# Bare expression: only displayed when run in an interactive session.
vendor,buyer

Note: the "something" strings are placeholders and can be anything, such as "battery" or "display".

synonyms json

{
"processor":["corei5","core","corei7","i5","i7","ryzen5","i5 processor","i7 processor","processor i5","processor i7","core generation","core gen"],

"ram":["DDR4","memory","DDR3","DDR","DDR 8gb","DDR 8 gb","DDR 16gb","DDR 16 gb","DDR 32gb","DDR 32 gb","DDR4-"],

"ssd":["solid state drive","solid drive"],

"hdd":["Hard Drive"]

 }

What do I need?

I want to add all the "something" strings to the vendor list dynamically.

NOTE: a "something" string can be anything in the future.

I want to add to the vendor list every "something" string that is not matched with a fuzzy score above 70. Basically, I also want to keep the left-out data.

for example like below:

current output

['processor Corei5 :1135G7 (11th Generation)',
 'i7 processor',
 'ram HT (100W) DDR4-2400',
 'ram ddr 8gb',
 'hdd hard drive',
 'ssd solid state']

expected output below

 ['processor Corei5 :1135G7 (11th Generation)',
 'i7 processor',
 'ram HT (100W) DDR4-2400',
 'ram ddr 8gb',
 'hdd hard drive',
 'ssd solid state',
 'something1',
 'something2',
 'something3']  # the "something" strings need to be added to the vendor list dynamically.

What silly mistake am I making? Thank you.


Solution

  • Here's my attempt:

    from fuzzywuzzy import process, fuzz
    
    # Category name -> phrases that should be labelled with that category.
    synonyms = {
        'processor': [
            'corei5', 'core', 'corei7', 'i5', 'i7', 'ryzen5',
            'i5 processor', 'i7 processor', 'processor i5', 'processor i7',
            'core generation', 'core gen',
        ],
        'ram': [
            'DDR4', 'memory', 'DDR3', 'DDR',
            'DDR 8gb', 'DDR 8 gb', 'DDR 16gb', 'DDR 16 gb',
            'DDR 32gb', 'DDR 32 gb', 'DDR4-',
        ],
        'ssd': ['solid state drive', 'solid drive'],
        'hdd': ['Hard Drive'],
    }
    # Sample inputs taken from the question.
    vendor_data = [
        'i7 processor',
        'solid state',
        'Corei5 :1135G7 (11th Generation)',
        'hard drive',
        'ddr 8gb',
        'something1',
        'something2',
        'something3',
        'HT (100W) DDR4-2400',
    ]
    buyer_data = [
        'i7 processor 12 generation',
        'corei7:latest technology',
    ]
    
    def find_synonym(s: str, min_score: int = 60):
        """Return the key of ``synonyms`` that best matches ``s``, or ``None``.

        ``s`` is compared against the module-level ``synonyms`` mapping with
        ``process.extractBests``; ``min_score`` is passed as its
        ``score_cutoff``. ``None`` is returned when the lookup yields no hits.
        """
        ranked = process.extractBests(s, choices=synonyms, score_cutoff=min_score)
        # The first hit is taken as the best one; its last element is returned
        # (for a mapping of choices this is expected to be the key - confirm
        # against the fuzzywuzzy documentation).
        return ranked[0][-1] if ranked else None
    
    def process_data(l: list, min_score: int = 60):
        """Split the entries of ``l`` into labelled matches and left-overs.

        Returns a ``(matches, no_matches)`` tuple of lists, each keeping the
        order in which entries appear in ``l``:

        * an entry for which ``find_synonym`` returns a category goes into the
          first list, prefixed with that category unless the category text is
          already contained in the entry;
        * otherwise, an entry whose ``fuzz.partial_ratio`` against any key of
          ``synonyms`` reaches ``min_score`` goes into the first list unchanged;
        * every other entry goes into the second list.
        """
        labelled, leftovers = [], []
        for entry in l:
            category = find_synonym(entry, min_score=min_score)
            if category is None:
                # No category from the synonym lookup: fall back to comparing
                # the entry with the category names themselves.
                names_category = any(
                    fuzz.partial_ratio(name, entry) >= min_score
                    for name in synonyms
                )
                if names_category:
                    labelled.append(entry)
                else:
                    leftovers.append(entry)
                continue
            # Avoid repeating the label when the entry already contains it.
            if category in entry:
                labelled.append(entry)
            else:
                labelled.append(f'{category} {entry}')
        return labelled, leftovers
    

    For process_data(vendor_data) we get:

    (['i7 processor',
      'ssd solid state',
      'processor Corei5 :1135G7 (11th Generation)',
      'hdd hard drive',
      'ram ddr 8gb',
      'ram HT (100W) DDR4-2400'],
     ['something1', 'something2', 'something3'])
    

    And for process_data(buyer_data):

    (['i7 processor 12 generation', 'processor corei7:latest technology'], [])
    

    I had to lower the cut-off score to 60 to also get results for ddr 8gb. The process_data function returns 2 lists: One with matches with words from the synonyms dict and one with items without matches. If you want exactly the output you listed in your question, just concatenate the two lists like this:

    # Matched entries keep the order of vendor_data; the unmatched entries are
    # appended after them, giving the same entries as the expected output.
    matches, no_matches = process_data(vendor_data)
    matches + no_matches  # ['i7 processor', 'ssd solid state', 'processor Corei5 :1135G7 (11th Generation)', 'hdd hard drive', 'ram ddr 8gb', 'ram HT (100W) DDR4-2400', 'something1', 'something2', 'something3']