pythonstringalgorithmdata-structuresstring-search

Python: Find If Substring Exists in String Given Condition


I'm trying to optimize this solution for a function that accepts 2 arguments: fullstring and substring. The function will return True if the substring exists in the fullstring, and False if it does not. There is one special wildcard that could be entered in the substring that denotes 0 or 1 of the previous symbol, and there can be more than one wildcard in the substring.

For example, "a*" means "" or "a"

The solution I have works fine but I'm trying to reduce the number of for loops (3) and optimize for time complexity. Using regex is not permitted. Is there a more pythonic way to do this?

Current Solution:

def complex_search(fullstring, substring):
    patterns = []
    if "*" in substring:
        index = substring.index("*")
        patterns.append(substring[:index-1] + substring[index+1:])
        patterns.append(substring[:index] + substring[index+1:])
    else:
        patterns.append(substring)

    def check(s1, s2):
        for a, b in zip(s1, s2):
            if a != b:
                return False
        return True

    for pattern in patterns:
        for i in range(len(fullstring) - len(pattern) + 1):
            if check(fullstring[i:i+len(pattern)], pattern):
                return True
    return False

>> print(complex_search("dogandcats", "dogs*andcats"))
>> True

Solution

  • Approach

    1. Create all alternatives for the substring based upon '" in substring (can have zero or more '' in substring)

      See Function combs(...) below

    2. Use Aho-Corasick to check if one of the substring patterns is in the string. Aho-Corasick is a very efficient algorithm for checking if one or more substrings appear in a string and formed as the basis of the original Unix command fgrep.

    3. For illustrative purposes a Python version of Aho-Corasik is used below, but a C implementation (with Python wrapper) is available at pyahocorasick for higher performance.

      See class Aho-Corasick below.

    Code

    # Note: This is a modification of code explained in https://carshen.github.io/data-structures/algorithms/2014/04/07/aho-corasick-implementation-in-python.html
    
    from collections import deque
    
    class Aho_Corasick():
        def __init__(self, keywords):
            self.adj_list = []
            # creates a trie of keywords, then sets fail transitions
            self.create_empty_trie()
            self.add_keywords(keywords)
            self.set_fail_transitions()
    
        def create_empty_trie(self):
            """ initalize the root of the trie """
            self.adj_list.append({'value':'', 'next_states':[],'fail_state':0,'output':[]})
    
        def add_keywords(self, keywords):
            """ add all keywords in list of keywords """
            for keyword in keywords:
                self.add_keyword(keyword)
    
        def find_next_state(self, current_state, value):
            for node in self.adj_list[current_state]["next_states"]:
                if self.adj_list[node]["value"] == value:
                    return node
            return None
    
        def add_keyword(self, keyword):
            """ add a keyword to the trie and mark output at the last node """
            current_state = 0
            j = 0
            keyword = keyword.lower()
            child = self.find_next_state(current_state, keyword[j])
            while child != None:
                current_state = child
                j = j + 1
                if j < len(keyword):
                    child = self.find_next_state(current_state, keyword[j])
                else:
                    break
    
            for i in range(j, len(keyword)):
                node = {'value':keyword[i],'next_states':[],'fail_state':0,'output':[]}
                self.adj_list.append(node)
                self.adj_list[current_state]["next_states"].append(len(self.adj_list) - 1)
                current_state = len(self.adj_list) - 1
            self.adj_list[current_state]["output"].append(keyword)
    
        def set_fail_transitions(self):
            q = deque()
            child = 0
            for node in self.adj_list[0]["next_states"]:
               q.append(node)
               self.adj_list[node]["fail_state"] = 0
            while q:
                r = q.popleft()
                for child in self.adj_list[r]["next_states"]:
                    q.append(child)
                    state = self.adj_list[r]["fail_state"]
                    while (self.find_next_state(state, self.adj_list[child]["value"]) == None
                          and state != 0):
                        state = self.adj_list[state]["fail_state"]
                    self.adj_list[child]["fail_state"] = self.find_next_state(state, self.adj_list[child]["value"])
                    if self.adj_list[child]["fail_state"] is None:
                        self.adj_list[child]["fail_state"] = 0
                    self.adj_list[child]["output"] = self.adj_list[child]["output"] + self.adj_list[self.adj_list[child]["fail_state"]]["output"]
    
        def get_keywords_found(self, line):
            """ returns keywords in trie from line """
            line = line.lower()
            current_state = 0
            keywords_found = []
    
            for i, c in enumerate(line):
                while self.find_next_state(current_state, c) is None and current_state != 0:
                    current_state = self.adj_list[current_state]["fail_state"]
                current_state = self.find_next_state(current_state, c)
                if current_state is None:
                    current_state = 0
                else:
                    for j in self.adj_list[current_state]["output"]:
                        yield {"index":i-len(j) + 1,"word":j}
        
        def pattern_found(self, line):
            ''' Returns true when the pattern is found '''
            return next(self.get_keywords_found(line), None) is not None
        
            
    def combs(word, n = 0, path = ""):
        ''' Generate all combinations of words with star
        
            e.g. list(combs("he*lp*")) = ['help', 'helpp', 'heelp', 'heelpp']
        '''
        if n == len(word):
            yield path
        elif word[n] == '*':
            # Next letter
            yield from combs(word, n+1, path)            # don't add * to path
        else:
            if n < len(word) - 1 and word[n+1] == '*':
                yield from combs(word, n+1, path)        # Not including letter at n
            yield from combs(word, n+1, path + word[n])  # including letter at n
        
    

    Test

    patterns = combs("dogs*andcats")                    # ['dogandcats', 'dogsandcats']
    aho = Aho_Corasick(patterns)                        # Aho-Corasick structure to recognize patterns
    print(aho.pattern_found("dogandcats"))              # Output: True
    print(aho.pattern_found("dogsandcats"))             # Output: True