I'm looking for some help with coding a word suggestion system using Python. Given a random sequence of characters as input, I want to be able to search through a wordlist and recommend matching words.
The closest thing I have found is a spelling correction system (https://norvig.com/spell-correct.html). When analysing the function "edits1", it does produce some results; however, this is based on one edit (e.g. inserting one 'a' into the input string).
What I want to achieve is inserting more than one letter (i.e. vowels or consonants). For example, given the letters 'prt', the dictionary search should recommend 'part', 'apart', etc.
import re
from collections import Counter
def words(text):
    """Lower-case `text` and return its runs of word characters as a list."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)
# Word-frequency table built from the wordlist, which contains numerous words,
# e.g. 'prut', 'prot', 'port', 'part', 'prat', 'pert', 'pret', 'apart'.
# The `with` block closes the file handle as soon as the text has been read.
with open('E:\\new\\words.txt') as wordlist_file:
    WORDS = Counter(words(wordlist_file.read()))
def candidates(word):
    """Generate possible spelling corrections for `word`.

    Returns the first non-empty result among: `word` itself if it is known,
    the known strings one edit away, the known strings two edits away.
    If all of those are empty, returns `[word]` unchanged.
    """
    exact = known([word])
    if exact:
        return exact

    one_away = known(edits1(word))
    if one_away:
        return one_away

    two_away = known(edits2(word))
    if two_away:
        return two_away

    # Nothing in the dictionary matched; hand the input back.
    return [word]
def known(words):
    """Return the set of entries from `words` that are present in WORDS."""
    return {candidate for candidate in words if candidate in WORDS}
def edits1(word, letters='aeiouxyz'):
    """Return all strings formed by inserting one character into `word`.

    Parameters:
        word: the string to extend.
        letters: iterable of characters that may be inserted; defaults to
            the vowels plus 'x', 'y' and 'z'.

    Returns:
        A set of strings, each one character longer than `word`.
    """
    # A string of length n has n + 1 insertion points (before each character
    # and after the last one). The previous bound of len(word) + 2 generated
    # the end-of-word split twice.
    return {
        word[:i] + c + word[i:]
        for i in range(len(word) + 1)
        for c in letters
    }
def edits2(word):
    """Lazily yield every string that is two edits away from `word`.

    The result is a generator and may contain duplicates.
    """
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))
import filler
# Print the suggestions for every query string in `h`.
h = ['prt']
for x in h:
    # Query the current item (previously h[0] was used on every iteration),
    # and keep the result in a name that does not shadow the built-in input().
    suggestions = filler.candidates(x)
    print(suggestions)
Well, I have modified your code. The Suggestor class receives two parameters, max_times and letters, so that you can change them whenever and however you want.
import re
from collections import Counter
def words(text):
    """Return every run of word characters in `text`, lower-cased, in order."""
    word_pattern = re.compile(r'\w+')
    return word_pattern.findall(text.lower())
# Word-frequency table built from the corpus file. The `with` block closes
# the file handle as soon as the text has been read.
with open('big.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))
class Suggestor:
    """Suggest dictionary words reachable by inserting letters into a string.

    Constructor arguments:
        max_times: maximum number of single-letter insertions to apply.
        letters: iterable of characters that may be inserted.
    """

    def __init__(self, max_times, letters):
        self.max_times = max_times
        self.letters = letters

    def candidates(self, word):
        """Return the known words obtainable from `word` with 0..max_times insertions."""
        return self.known(self.edited_word(word))

    def known(self, words):
        """Return the set of entries from `words` that are present in WORDS."""
        return {w for w in words if w in WORDS}

    def edit(self, word):
        """Return a list of the distinct strings made by inserting one letter into `word`."""
        # A string of length n has n + 1 insertion points; the previous bound
        # of len(word) + 2 generated the end-of-word split twice.
        return list({
            word[:i] + c + word[i:]
            for i in range(len(word) + 1)
            for c in self.letters
        })

    def edited_word(self, raw_word):
        """Return `raw_word` followed by every string reachable with 1..max_times insertions.

        The first element is always `raw_word`; the order of the remaining
        elements within each insertion level is unspecified.
        """
        results = [raw_word]
        frontier = {raw_word}
        for _ in range(self.max_times):
            # Expand only the strings produced by the previous round.
            frontier = {
                edited
                for word in frontier
                for edited in self.edit(word)
            }
            results.extend(frontier)
        return results
if __name__ == '__main__':
    # Demo: look up suggestions for 'prt', allowing up to four inserted letters.
    suggestor = Suggestor(letters='aeiouxyz', max_times=4)
    word = 'prt'
    suggestions = suggestor.candidates(word)
    print(suggestions)
And the output of the above test is:
{'partie', 'parity', 'purity', 'part', 'port', 'proto', 'porto', 'party', 'apart', 'parait', 'export', 'operate', 'expert', 'pirate'}
Moreover, I suggest checking the probabilities of all the candidate words; you could then filter some of them out using Bayes' theorem.