pythonstringalgorithmsliding-window

Find substrings that contain a certain amount of certain characters


I have a string like GGACATCGCGGTGGATCGAC.

How can I find all substrings that contain, for example, three Gs in any order?

For this string, it would be GCGG or GTGG, or GGACATCG.

Also need to find such substrings for several letters, like, substrings with two Gs and one C (CGAG, GGAC, GGATC, CGG)


Solution

  • Sliding Window:

    Code

    def _sliding_window(s, char_a, char_b, count_a, count_b):
        res = []
        starts = []
    
        for i in range(len(s)):
            char = s[i]
    
            if char in (char_a, char_b):
                starts.append(i)
    
        for start in starts:
            As, Bs = 0, 0
            temp = ""
    
            for i in range(start, len(s)):
                char = s[i]
    
                if char == char_a:
                    As += 1
                if char == char_b:
                    Bs += 1
    
                temp += char
    
                if As == count_a and Bs == count_b:
                    res.append(temp)
                    break
    
        return res
    
    
    s = "GGACATCGCGGTGGATCGCCACGCGACATCGCGGTGGATCGACGGACATCCGCGGTGCGATCCGACGACATGCGCGCG"
    
    print(_sliding_window(s, 'G', 'C', 2, 1), "\n")
    print(_sliding_window(s, 'G', 'C', 3, 0), "\n")
    print(_sliding_window(s, 'G', 'C', 1, 2), "\n")
    print(_sliding_window(s, 'G', 'C', 0, 3), "\n")
    
    
    
    

    Note:

    Prints

    ['GGAC', 'GCG', 'CGG', 'GGATC', 'GATCG', 'GCG', 'GCG', 'CGG', 'GGATC', 'GATCG', 'GACG', 'CGG', 'GGAC', 'GCG', 'CGG', 'GTGC', 'GCG', 'GACG', 'GACATG', 'GCG', 'GCG', 'GCG'] 
    
    ['GGTG', 'GTGG', 'GGTG', 'GTGG', 'GGTG'] 
    
    ['GACATC', 'CATCG', 'CGC', 'CGC', 'GCC', 'CACG', 'CGC', 'CGAC', 'GACATC', 'CATCG', 'CGC', 'CGAC', 'GACATC', 'CCG', 'CGC', 'CGATC', 'GATCC', 'CCG', 'CGAC', 'CGAC', 'CATGC', 'CGC', 'CGC'] 
    
    ['CCAC', 'CATCC'] 
    
    

    Regex Pattern

    It'd be difficult to write pattern for each and every case. But the way to write a pattern is as follows.

    For 3G in any order:

    (?=.*G.*G.*G)([G].*?[G].*?[G])
    

    Code

    import re
    
    p = r'(?=.*G.*G.*G)([G].*?[G].*?[G])'
    
    s = """
    GGACATCGCGGTGGATCGACGGACATCGCGGTGGATCGACGGACATCGCGGTGGATCGAC GACATGCGCGCG. GGACATGCGCGCGGGACATGCGCGCG. GGACATGCGCGCG. 
    GCGG or GTGG, or GGACATCG. GCGG or GTGG, or GGACATGCGCGCG. GGACATGCGCGCG. GGACATGCGCGCGGGACATGCGCGCG. GGACATGCGCGCG. 
    GCGG or GTGG, or GGACATCG. GCGG or GTGG, or GGACATGCGCGCG. GGACATGCGCGCG. GGACATGCGCGCGGGACATGCGCGCG. GGACATGCGCGCG. 
    """
    
    find_3G = re.findall(p, s)
    
    print(find_3G)
    
    # Note that 'GCG. G' is also part of those found.
    
    
    

    Prints

    ['GGACATCG', 'GGTG', 'GATCGACG', 'GACATCGCG', 'GTGG', 'GACGG', 'GCGG', 'GGATCG', 'GACATGCG', 'GCG. G', 'GACATGCG', 'GCGG', 'GACATGCG', 'GCG. G', 'GACATGCG', 'GCGG', 'GTGG', 'GGACATCG', 'GCGG', 'GTGG', 'GGACATG', 'GCGCG', 'GGACATG', 'GCGCG', 'GGACATG', 'GCGCG', 'GGACATG', 'GCGCG', 'GGACATG', 'GCGCG', 'GCGG', 'GTGG', 'GGACATCG', 'GCGG', 'GTGG', 'GGACATG', 'GCGCG', 'GGACATG', 'GCGCG', 'GGACATG', 'GCGCG', 'GGACATG', 'GCGCG', 'GGACATG', 'GCGCG']


    A naive brute force algorithm for two Gs and one Cs would be the following O(N ^ 2):

    def _pattern(s):
        return s.count('G') == 2 and s.count('C') == 1
    
    def _collect(s):
        res = []
        for i in range(len(s)):
            for j in range(i + 1, len(s) + 1):
                part = s[i:j]
                if _pattern(part):
                    res.append(part)
        return res
    
    
    s = """
    GGACATCGCGGTGGATCGACGGACATCGCGGTGGATCGACGGACATCGCGGTGGATCGAC GACATGCGCGCG. GGACATGCGCGCGGGACATGCGCGCG. GGACATGCGCGCG. 
    GCGG or GTGG, or GGACATCG. GCGG or GTGG, or GGACATGCGCGCG. GGACATGCGCGCG. GGACATGCGCGCGGGACATGCGCGCG. GGACATGCGCGCG. 
    GCGG or GTGG, or GGACATCG. GCGG or GTGG, or GGACATGCGCGCG. GGACATGCGCGCG. GGACATGCGCGCGGGACATGCGCGCG. GGACATGCGCGCG. 
    """
    G, C = 2, 1
    
    print(_collect(s))
    
    

    Prints

    ['\nGGAC', '\nGGACA', '\nGGACAT', 'GGAC', 'GGACA', 'GGACAT', 'GCG', 'CGG', 'CGGT', 'TGGATC', 'GGATC', 'GATCG', 'GATCGA', 'GACG', 'ACGG', 'ACGGA', 'CGG', 'CGGA', 'GGAC', 'GGACA', 'GGACAT', 'GCG', 'CGG', 'CGGT', 'TGGATC', 'GGATC', 'GATCG', 'GATCGA', 'GACG', 'ACGG', 'ACGGA', 'CGG', 'CGGA', 'GGAC', 'GGACA', 'GGACAT', 'GCG', 'CGG', 'CGGT', 'TGGATC', 'GGATC', 'GATCG', 'GATCGA', 'GAC G', 'GAC GA', ' GACATG', 'GACATG', 'ATGCG', 'TGCG', 'GCG', 'GCG', 'GCG', 'GCG.', 'GCG. ', 'CG. G', '. GGAC', '. GGACA', '. GGACAT', ' GGAC', ' GGACA', ' GGACAT', 'GGAC', 'GGACA', 'GGACAT', 'GACATG', 'ATGCG', 'TGCG', 'GCG', 'GCG', 'GCG', 'CGG', 'GGAC', 'GGACA', 'GGACAT', 'GACATG', 'ATGCG', 'TGCG', 'GCG', 'GCG', 'GCG', 'GCG.', 'GCG. ', 'CG. G', '. GGAC', '. GGACA', '. GGACAT', ' GGAC', ' GGACA', ' GGACAT', 'GGAC', 'GGACA', 'GGACAT', 'GACATG', 'ATGCG', 'TGCG', 'GCG', 'GCG', 'GCG', 'GCG.', 'GCG. ', 'GCG. \n', 'CG. \nG', 'G. \nGC', '. \nGCG', ' \nGCG', '\nGCG', 'GCG', 'CGG', 'CGG ', 'CGG o', 'CGG or', 'CGG or ', ', or GGAC', ', or GGACA', ', or GGACAT', ' or GGAC', ' or GGACA', ' or GGACAT', 'or GGAC', 'or GGACA', 'or GGACAT', 'r GGAC', 'r GGACA', 'r GGACAT', ' GGAC', ' GGACA', ' GGACAT', 'GGAC', 'GGACA', 'GGACAT', 'ATCG. G', 'TCG. G', 'CG. G', 'G. GC', '. GCG', ' GCG', 'GCG', 'CGG', 'CGG ', 'CGG o', 'CGG or', 'CGG or ', ', or GGAC', ', or GGACA', ', or GGACAT', ' or GGAC', ' or GGACA', ' or GGACAT', 'or GGAC', 'or GGACA', 'or GGACAT', 'r GGAC', 'r GGACA', 'r GGACAT', ' GGAC', ' GGACA', ' GGACAT', 'GGAC', 'GGACA', 'GGACAT', 'GACATG', 'ATGCG', 'TGCG', 'GCG', 'GCG', 'GCG', 'GCG.', 'GCG. ', 'CG. G', '. GGAC', '. GGACA', '. GGACAT', ' GGAC', ' GGACA', ' GGACAT', 'GGAC', 'GGACA', 'GGACAT', 'GACATG', 'ATGCG', 'TGCG', 'GCG', 'GCG', 'GCG', 'GCG.', 'GCG. ', 'CG. G', '. GGAC', '. GGACA', '. GGACAT', ' GGAC', ' GGACA', ' GGACAT', 'GGAC', 'GGACA', 'GGACAT', 'GACATG', 'ATGCG', 'TGCG', 'GCG', 'GCG', 'GCG', 'CGG', 'GGAC', 'GGACA', 'GGACAT', 'GACATG', 'ATGCG', 'TGCG', 'GCG', 'GCG', 'GCG', 'GCG.', 'GCG. ', 'CG. G', '. GGAC', '. GGACA', '. GGACAT', ' GGAC', ' GGACA', ' GGACAT', 'GGAC', 'GGACA', 'GGACAT', 'GACATG', 'ATGCG', 'TGCG', 'GCG', 'GCG', 'GCG', 'GCG.', 'GCG. ', 'GCG. \n', 'CG. \nG', 'G. \nGC', '. \nGCG', ' \nGCG', '\nGCG', 'GCG', 'CGG', 'CGG ', 'CGG o', 'CGG or', 'CGG or ', ', or GGAC', ', or GGACA', ', or GGACAT', ' or GGAC', ' or GGACA', ' or GGACAT', 'or GGAC', 'or GGACA', 'or GGACAT', 'r GGAC', 'r GGACA', 'r GGACAT', ' GGAC', ' GGACA', ' GGACAT', 'GGAC', 'GGACA', 'GGACAT', 'ATCG. G', 'TCG. G', 'CG. G', 'G. GC', '. GCG', ' GCG', 'GCG', 'CGG', 'CGG ', 'CGG o', 'CGG or', 'CGG or ', ', or GGAC', ', or GGACA', ', or GGACAT', ' or GGAC', ' or GGACA', ' or GGACAT', 'or GGAC', 'or GGACA', 'or GGACAT', 'r GGAC', 'r GGACA', 'r GGACAT', ' GGAC', ' GGACA', ' GGACAT', 'GGAC', 'GGACA', 'GGACAT', 'GACATG', 'ATGCG', 'TGCG', 'GCG', 'GCG', 'GCG', 'GCG.', 'GCG. ', 'CG. G', '. GGAC', '. GGACA', '. GGACAT', ' GGAC', ' GGACA', ' GGACAT', 'GGAC', 'GGACA', 'GGACAT', 'GACATG', 'ATGCG', 'TGCG', 'GCG', 'GCG', 'GCG', 'GCG.', 'GCG. ', 'CG. G', '. GGAC', '. GGACA', '. GGACAT', ' GGAC', ' GGACA', ' GGACAT', 'GGAC', 'GGACA', 'GGACAT', 'GACATG', 'ATGCG', 'TGCG', 'GCG', 'GCG', 'GCG', 'CGG', 'GGAC', 'GGACA', 'GGACAT', 'GACATG', 'ATGCG', 'TGCG', 'GCG', 'GCG', 'GCG', 'GCG.', 'GCG. ', 'CG. G', '. GGAC', '. GGACA', '. GGACAT', ' GGAC', ' GGACA', ' GGACAT', 'GGAC', 'GGACA', 'GGACAT', 'GACATG', 'ATGCG', 'TGCG', 'GCG', 'GCG', 'GCG', 'GCG.', 'GCG. ', 'GCG. \n']


    Note