I'm trying to optimize this solution for a function that accepts 2 arguments: fullstring
and substring
. The function will return True
if the substring exists in the fullstring, and False
if it does not. There is one special wildcard that could be entered in the substring that denotes 0 or 1 of the previous symbol, and there can be more than one wildcard in the substring.
For example, "a*"
means ""
or "a"
The solution I have works fine but I'm trying to reduce the number of for loops (3) and optimize for time complexity. Using regex is not permitted. Is there a more pythonic way to do this?
Current Solution:
def complex_search(fullstring, substring):
patterns = []
if "*" in substring:
index = substring.index("*")
patterns.append(substring[:index-1] + substring[index+1:])
patterns.append(substring[:index] + substring[index+1:])
else:
patterns.append(substring)
def check(s1, s2):
for a, b in zip(s1, s2):
if a != b:
return False
return True
for pattern in patterns:
for i in range(len(fullstring) - len(pattern) + 1):
if check(fullstring[i:i+len(pattern)], pattern):
return True
return False
>> print(complex_search("dogandcats", "dogs*andcats"))
>> True
Approach
Create all alternatives for the substring based upon '" in substring (can have zero or more '' in substring)
See Function combs(...) below
Use Aho-Corasick to check if one of the substring patterns is in the string. Aho-Corasick is a very efficient algorithm for checking if one or more substrings appear in a string and formed as the basis of the original Unix command fgrep.
For illustrative purposes a Python version of Aho-Corasik is used below, but a C implementation (with Python wrapper) is available at pyahocorasick for higher performance.
See class Aho-Corasick below.
Code
# Note: This is a modification of code explained in https://carshen.github.io/data-structures/algorithms/2014/04/07/aho-corasick-implementation-in-python.html
from collections import deque
class Aho_Corasick():
def __init__(self, keywords):
self.adj_list = []
# creates a trie of keywords, then sets fail transitions
self.create_empty_trie()
self.add_keywords(keywords)
self.set_fail_transitions()
def create_empty_trie(self):
""" initalize the root of the trie """
self.adj_list.append({'value':'', 'next_states':[],'fail_state':0,'output':[]})
def add_keywords(self, keywords):
""" add all keywords in list of keywords """
for keyword in keywords:
self.add_keyword(keyword)
def find_next_state(self, current_state, value):
for node in self.adj_list[current_state]["next_states"]:
if self.adj_list[node]["value"] == value:
return node
return None
def add_keyword(self, keyword):
""" add a keyword to the trie and mark output at the last node """
current_state = 0
j = 0
keyword = keyword.lower()
child = self.find_next_state(current_state, keyword[j])
while child != None:
current_state = child
j = j + 1
if j < len(keyword):
child = self.find_next_state(current_state, keyword[j])
else:
break
for i in range(j, len(keyword)):
node = {'value':keyword[i],'next_states':[],'fail_state':0,'output':[]}
self.adj_list.append(node)
self.adj_list[current_state]["next_states"].append(len(self.adj_list) - 1)
current_state = len(self.adj_list) - 1
self.adj_list[current_state]["output"].append(keyword)
def set_fail_transitions(self):
q = deque()
child = 0
for node in self.adj_list[0]["next_states"]:
q.append(node)
self.adj_list[node]["fail_state"] = 0
while q:
r = q.popleft()
for child in self.adj_list[r]["next_states"]:
q.append(child)
state = self.adj_list[r]["fail_state"]
while (self.find_next_state(state, self.adj_list[child]["value"]) == None
and state != 0):
state = self.adj_list[state]["fail_state"]
self.adj_list[child]["fail_state"] = self.find_next_state(state, self.adj_list[child]["value"])
if self.adj_list[child]["fail_state"] is None:
self.adj_list[child]["fail_state"] = 0
self.adj_list[child]["output"] = self.adj_list[child]["output"] + self.adj_list[self.adj_list[child]["fail_state"]]["output"]
def get_keywords_found(self, line):
""" returns keywords in trie from line """
line = line.lower()
current_state = 0
keywords_found = []
for i, c in enumerate(line):
while self.find_next_state(current_state, c) is None and current_state != 0:
current_state = self.adj_list[current_state]["fail_state"]
current_state = self.find_next_state(current_state, c)
if current_state is None:
current_state = 0
else:
for j in self.adj_list[current_state]["output"]:
yield {"index":i-len(j) + 1,"word":j}
def pattern_found(self, line):
''' Returns true when the pattern is found '''
return next(self.get_keywords_found(line), None) is not None
def combs(word, n = 0, path = ""):
''' Generate all combinations of words with star
e.g. list(combs("he*lp*")) = ['help', 'helpp', 'heelp', 'heelpp']
'''
if n == len(word):
yield path
elif word[n] == '*':
# Next letter
yield from combs(word, n+1, path) # don't add * to path
else:
if n < len(word) - 1 and word[n+1] == '*':
yield from combs(word, n+1, path) # Not including letter at n
yield from combs(word, n+1, path + word[n]) # including letter at n
Test
patterns = combs("dogs*andcats") # ['dogandcats', 'dogsandcats']
aho = Aho_Corasick(patterns) # Aho-Corasick structure to recognize patterns
print(aho.pattern_found("dogandcats")) # Output: True
print(aho.pattern_found("dogsandcats")) # Output: True