Tags: python, regex, parsing, speech, phoneme

Python pattern matching of rules to parse text to phonemes


I have a set of rules that can be used to convert text to a set of phonemes. The application of these rules would result in conversions such as the following:

a            uh
ability      ae-b-ih-l-ih-t-ee
aboard       uh-b-oh-r-d
abort        uh-b-oh-r-t
affirmative  ah-f-eh-r-m-ah-t-ih-v
all          aw-l
alter        ah-l-t-r
an           ae-n
and          ae-n-d
Andy         ae-n-d-ee
any          eh-n-ee
anybody      ae-n-ee-b-ah-d-ee
at           ae-t
attacked     uh-t-ae-k-t

I want to create a function that can be applied to text and return the phonemes corresponding to this text using the conversion rules.

A rule consists of a few parts. The first part is the text token under consideration. The second part is the text token found before the token under consideration. The third part is the text token found after the token under consideration. The fourth part is the appropriate phoneme that should result in the conversion. Rules can be written in the following way, with the different parts separated by slashes:

text found/text before text found/text after text found/phoneme

Given rules of this form, what would be a good way to apply them to strings of text? I want to try to build a function that can parse text to find a rule match.


Rules are as follows:

#  one or more vowels (AEIOUY)
+  one of E, I, Y (a front vowel)
:  zero or more consonants (BCDFGHJKLMNPQRSTVWXZ)
^  one consonant
.  one of B, V, D, G, J, L, M, N, R, W, Z (a voiced consonant)
%  one of ER, E, ES, ED, ING, ELY (a suffix)
&  one of S, C, G, Z, X, J, CH, SH (a sibilant)
@  one of T, S, R, D, L, Z, N, J, TH, CH, SH (a consonant influencing following u)

" /// "
"A// /UH"
"ARE/ / /AH-R"
"AR/ /O/UH-R"
"AR//#/EH-R"
"AS/ ^/#/AE-A-S"
"A//WA/UH"
"AW///AW"
"ANY/ ://EH-N-EE"
"A//^+#/AE-A"
"ALLY/#://UH-L-EE"
"AL/ /#/UH-L"
"AGAIN///UH-G-EH-N"
"AG/#:/E/IH-J"
"A//^+:#/AE"
"A/ :/^+/AE-A"
"ARR/ //UH-R"
"ARR///AE-R"
"AR/ ://AH-R"
"AR// /AE-R"
"AR///AH-R"
"AIR///EH-R"
"AI///AE-A"
"AY///AE-A"
"AU///AW"
"AL/#:/ /UH-L"
"ALS/#:/ /UH-L-Z"
"ALK///AW-K"
"AL//^/AW-L"
"ABLE/ ://AE-A-B-UH-L"
"ABLE///UH-B-UH-L"
"ANG//+/AE-A-N-J"
"ATHE/ C/ /AE-TH-EE"
"A//A/AH"
"A///AE"
"BE/ /^#/B-IH"
"BEING///B-EE-IH-N"
"BOTH/ / /B-OH-TH"
"BUS/ /#/B-IH-Z"
"BUIL///B-IH-L"
"B/ / /B-EE"
"B///B"
"CH/ /^/K"
"CH/^E//K"
"CH///CH"
"CI/ S/#/S-AH-EE"
"CI//A/SH"
"CI//O/SH"
"CI//EN/SH"
"C//+/S"
"CK///K"
"COM//%/K-AH-M"
"C/ / /S-EE"
"C///K"
"DED/#:/ /D-IH-D"
"D/.E/ /D"
"D/#^:E/ /T"
"DE/ /^#/D-IH"
"DO/ / /D-OO"
"DOES/ //D-UH-Z"
"DOING/ //D-OO-IH-N"
"DOW/ //D-OH"
"DU//A/J-OO"
"D/ / /D-EE"
"DOUGH///D-OH"
"D///D"
"E/#:/ /"
"E/'^:/ /"
"E/ :/ /EE"
"ED/#/ /D"
"E/#:/D /"
"ER//EV/EH-V"
"EVEN/ EL//EH-V-EH-N"
"EVEN/ S//EH-V-EH-N"
"E//^%/EE"
"E//PH%/EE"
"ERI//#/EE-R-EE"
"ER/#:/#/AE-R"
"ER//#/EH-R"
"ER///AE-R"
"EVEN/ //EE-V-EH-N"
"E/#:/W/"
"EW/@//OO"
"EW///Y-OO"
"E//O/EE"
"ES/#:&/ /IH-Z"
"E/#:/S /"
"ELY/#://L-EE"
"EMENT/#://M-EH-N-T"
"EFUL///F-U-L"
"EE///EE"
"EARN///AE-R-N"
"EAR/ /^/AE-R"
"EAD///EH-D"
"EA/#:/ /EE-UH"
"EA//SU/EH"
"EA///EE"
"EIGH///AE-A"
"EI///EE"
"EYE/ //AH-EE"
"EY///EE"
"EU///Y-OO"
"E/ / /EE"
"E/^/ /"
"E///EH"
"FUL///F-U-L"
"F/F//"
"F/ / /EH-F"
"F///F"
"GIV///G-IH-V"
"G/ /I^/G"
"GE//T/G-EH"
"GGES/SU//G-J-EH-SS"
"G/G//"
"G/ B#//G"
"G//+/J"
"GREAT///G-R-AE-A-T"
"GH/#//"
"G/ / /G-EE"
"G///G"
"HAV/ //H-AE-V"
"HERE/ //H-EE-R"
"HOUR/ //OH-AE-R"
"HOW///H-OH"
"H//#/H"
"H/ / /H-AE-CH"
"H///"
"IN/ //IH-N"
"I/ / /AH-EE"
"IN//D/IH-N"
"IER///EE-AE-R"
"IED/#:R//EE-D"
"IED// /AH-EE-D"
"IEN///EE-EH-N"
"IE//T/AH-EE-EH"
"I/ :/%/AH-EE"
"I//%/EE"
"IE///EE"
"INE/N//AH-EE-N"
"IME/T//AH-EE-M"
"I//^+:#/IH"
"IR//#/AH-EE-R"
"IS//%/AH-EE-S"
"IX//%/IH-K-S"
"IZ//%/AH-EE-Z"
"I//D%/AH-EE"
"I/+^/^+/IH"
"I//T%/AH-EE"
"I/#^:/^+/IH"
"I//^+/AH-EE"
"IR///AE-R"
"IGH///AH-EE"
"ILD///AH-EE-L-D"
"IGN// /AH-EE-N"
"IGN//^/AH-EE-N"
"IGN//%/AH-EE-N"
"IQUE///EE-K"
"I///IH"
"J/ / /J-A-EE"
"J///J"
"K//N/"
"K/ / /K-A-EE"
"K///K"
"LO//C#/L-OH"
"L/L//"
"L/#^:/%/UH-L"
"LEAD///L-EE-D"
"L/ / /AE-L"
"L///L"
"MOV///M-OO-V"
"M/ / /EH-M"
"M///M"
"NG/E/+/N-J"
"NG//R/N"
"NG//#/N"
"NGL//%/N-UH-L"
"NG///N"
"NK///N-K"
"NOW/ / /N-OH"
"N/ / /EH-N"
"N/N//"
"N///N"
"OF// /UH-V"
"OROUGH///AE-R-OH"
"OR/ F/TY/OH-R"
"OR/#:/ /AE-R"
"ORS/#:/ /AE-R-Z"
"OR///AW-R"
"ONE/ //W-UH-N"
"OW//EL/OH"
"OW///OH"
"OVER/ //OH-V-AE-R"
"OV///UH-V"
"O//^%/OH"
"O//^EN/OH"
"O//^I#/OH"
"OL//D/OH-L"
"OUGHT///AH-T"
"OUGH///UH-F"
"OU/ /^L/UH"
"OU/ //OH"
"OU/H/S#/OH"
"OUS///UH-S"
"OUR/ F//OH-R"
"OUR///AW-R"
"OUD///U-D"
"OUP///OO-P"
"OU///OH"
"OY///AW-EE"
"OING///OH-IH-N"
"OI///AW-EE"
"OOR///OH-R"
"OOK///U-K"
"OOD///U-D"
"OO///OO"
"O//E/OH"
"O// /OH"
"OA// /OH"
"ONLY/ //OH-N-L-EE"
"ONCE/ //W-UH-N-S"
"ON'T// /OH-N-T"
"O/C/N/AH"
"O//NG/AH"
"O/^:/N/UH"
"ON/I//UH-N"
"ON/#:/ /UH-N"
"ON/#^//UH-N"
"O//ST /OH"
"OF//^/AW-F"
"OTHER///UH-TH-AE-R"
"OSS// /AW-S"
"OM/#^:/ /UH-M"
"O///AH"
"PH///F"
"PEOP///P-EE-P"
"POW///P-OH"
"PUT// /P-U-T"
"P/ / /P-EE"
"P/P//"
"P///P"
"QUAR///K-W-AW-R"
"QU/ //K-W"
"QU///K"
"Q/ / /K-OO"
"Q///K"
"RE/ /^#/R-EE"
"R/ / /AH"
"R/R//"
"R///R"
"SH///SH"
"SION/#//ZH-UH-N"
"SOME///S-AH-M"
"SUR/#/#/ZH-AE-R"
"SUR//#/SH-AE-R"
"SU/#/#/ZH-OO"
"SSU/#/#/SH-OO"
"SED/#/ /Z-D"
"S/#/#/Z"
"SAID///S-EH-D"
"SION/^//SH-UH-N"
"S/S//"
"S/./ /Z"
"S/#:.E/ /Z"
"S/#^:##/ /Z"
"S/#^:#/ /S"
"S/U/ /S"
"S/ :#/ /Z"
"SCH/ //S-K"
"S//C+/"
"SM/#//Z-M"
"SN/#/ /Z-UH-N"
"S/ / /EH-S"
"S///S"
"THE/ / /TH-UH"
"TO// /T-OO"
"THAT///TH-AE-T"
"THIS/ / /TH-IH-S"
"THEY/ //TH-AE-A"
"THERE/ //TH-EH-R"
"THER///TH-AE-R"
"THEIR///TH-EH-EH"
"THAN/ / /TH-AE-N"
"THEM/ / /TH-EH-M"
"THESE// /TH-EE-Z"
"THEN/ //TH-EH-N"
"THROUGH///TH-R-OO"
"THOSE///TH-OH-Z"
"THOUGH// /TH-OH"
"THUS/ //TH-UH-S"
"TH///TH"
"TED/#:/ /T-IH-D"
"TI/S/#N/CH"
"TI//O/SH"
"TI//A/T"
"TIEN///SH-UH-N"
"TUR//#/CH-AE-R"
"TU//A/CH-OO"
"TWO/ //T-OO"
"T/ / /T-EE"
"T/T//"
"T///T"
"UN/ /I/Y-OO-N"
"UN/ //UH-N"
"UPON/ //UH-P-AW-N"
"UR/@/#/AE-R"
"UR//#/Y-AE-R"
"UR///AE-R"
"U//^ /UH"
"U//^^/UH"
"UY///AH-EE"
"U/ G/#/"
"U/G/%/"
"U/G/#/W"
"U/#N//Y-OO"
"UI/@//OO"
"U/@//UH"
"U///Y-OO"
"VIEW///V-Y-OO"
"V/ / /V-EE"
"V///V"
"WHERE/ //W-AE-R"
"WA//S/W-AH"
"WA//T/W-AH"
"WHERE///WH-EH-R"
"WHAT///WH-AH-T"
"WHOL///H-OH-L"
"WHO///H-OO"
"WH///WH"
"WAR///W-AH-R"
"WOR///W-AE-R"
"WR///R"
"W/ / /D-AH-B-L-Y-OO"
"W///W"
"X//^/EH-K-S"
"X/ / /EH-K-S"
"X/ /#/Z-EH"
"X///K-S"
"YOUNG///Y-UH-N"
"YOU/ //Y-OO"
"YES/ //Y-EH-S"
"Y/ / /WH-UH-Y"
"Y/ //Y"
"Y/#^:/ /EE"
"Y/#^:/I/EE"
"Y/ :/ /AH-EE"
"Y/ :/#/AH-EE"
"Y/ :/^+:#/IH"
"Y/ :/^#/AH-EE"
"Y///IH"
"ZZ///T-Z"
"Z/ / /Z-EH-D"
"Z///Z"

Solution

  • Turns out that lookbehind requires the pattern to be of fixed size, which doesn't fit with your rules, so we have to be a little bit more complex.

    First let's define a translation between your syntax and regex:

    # Map each rule metacharacter to an equivalent regex fragment.
    # Multi-character alternatives are listed before their single-character
    # prefixes so the regex engine matches the whole suffix/digraph
    # (alternation is leftmost-first: 'E|ES' would never match 'ES' whole).
    rule_syntax = {
        '#': r'[AEIOUY]+',                 # one or more vowels
        '+': r'[EIY]',                     # a front vowel
        ':': r'[BCDFGHJKLMNPQRSTVWXZ]*',   # zero or more consonants
        '^': r'[BCDFGHJKLMNPQRSTVWXZ]',    # exactly one consonant
        '.': r'[BVDGJLMNRWZ]',             # a voiced consonant
        '%': r'(?:ING|ELY|ER|ES|ED|E)',    # a suffix (spec says ELY, not EL)
        '&': r'(?:CH|SH|[SCGZXJ])',        # a sibilant
        '@': r'(?:TH|CH|SH|[TSRDLZNJ])',   # consonant influencing a following U
    }
    

    and a function to create a regex fragment from this mapping:

    def mkregex(rule):
        """Translate a rule-part written in the rule metalanguage into a regex fragment.

        Each metacharacter ('#', '+', ':', ...) expands to its regex
        equivalent from `rule_syntax`; every other character is kept literal.
        """
        return ''.join(rule_syntax.get(ch, ch) for ch in rule)
    

    I'm not sure how you want to handle rules with spaces, I've commented out the ' /// ' rule to get the results below.

    Now we implement a function that converts your rule syntax into an "interesting" tuple:

    def mkrule(ruletxt):
        """Compile one 'txt/before/after/phoneme' rule into a usable tuple.

        Returns (regex, phonemes, original rule text).  The 'before' context
        is consumed via a non-capturing group (a lookbehind would need a
        fixed-width pattern, which the rule syntax cannot guarantee), the
        token itself is captured as the named group 'found', and the 'after'
        context is expressed as a lookahead so it is not consumed.
        """
        txt, before, after, phoneme = ruletxt.split('/')

        parts = []
        if before:
            # Consume (but do not capture) the leading context.
            parts.append('(?:%s)' % mkregex(before))
        # Capture the token under consideration so it can be located later.
        parts.append('(?P<found>%s)' % txt)
        if after:
            # Lookahead: the trailing context must follow but is not consumed.
            parts.append('(?=%s)' % mkregex(after))

        # Phonemes are lower-cased and wrapped in dashes so they can never be
        # mistaken for (upper-case) unconverted input text; the original rule
        # text is kept for explaining and debugging.
        return ''.join(parts), "-%s-" % phoneme.lower(), ruletxt
    

    The approach we will take is to iteratively replace matched rules with phonemes. To make sure we don't replace text that has already been converted (i.e. phonemes), we will make the input string upper cased, and the phonemes lower cased. To prevent the phonemes from running into each other we've added a - on each side (we'll have to clean this up at the end).

    Convert all your rules to interesting tuples:

    # Pre-compile every rule string into a (regex, phonemes, rule-text) tuple.
    # NOTE: the list is elided here with "..." for brevity -- in the real
    # program every rule string from the full rule set above appears here,
    # in the same order (order matters: earlier rules win).
    rules = [mkrule(r) for r in [
        #" /// ",          # this rule creates problems
        "A// /UH",
        "ARE/ / /AH-R",
        "AR/ /O/UH-R",
        "AR//#/EH-R",
        "AS/ ^/#/AE-A-S",
        "A//WA/UH",
        "AW///AW",
        ...
    ]]
    

    We're almost there, just a function to replace the found text from a single rule:

    def match_and_replace(word, rule, phonemes):
        """Return `word` with every match of `rule` replaced by `phonemes`.

        Only the span of the named group 'found' is replaced.  The 'before'
        context is consumed by the regex (lookbehind cannot be used because
        it requires a fixed-width pattern), so using the whole match span
        would wrongly clobber that context -- e.g. a rule whose context is a
        word-boundary space would eat the space itself.
        """
        # A rule can match multiple times; collect the span of the found text.
        matches = [m.span('found') for m in re.finditer(rule, word)]
        matches.reverse()  # replace right-to-left so earlier spans stay valid
        chars = list(word)  # strings are immutable, so edit a list of chars
        for start, end in matches:
            chars[start:end] = phonemes
        return ''.join(chars)  # convert back to string
    

    Finally, the function to extract 'phonemes' from a word:

    def phonemes(word, explain=False):
        """Convert `word` to a dash-separated phoneme string using `rules`.

        With explain=True, print each rule application (a rule engine should
        always be able to explain its results ;-).

        The original snippet used Python-2-only print statements; these are
        rewritten as single-argument print() calls that produce identical
        output on both Python 2 and 3.
        """
        if explain:
            print("word  : %s" % word)

        # Upper-case the word and pad it with spaces so rules whose context
        # is a word boundary (' ') have something to match against.
        # Converted phonemes are lower-case, so no rule can re-match them.
        result = " %s " % word.upper()
        step = 0

        # Apply every rule, in order, to the whole working string.
        for rule, phoneme, ruletxt in rules:
            # tmp is `result` with all matches of `rule` replaced by `phoneme`
            tmp = match_and_replace(result, rule, phoneme)
            if explain and tmp != result:
                step += 1
                print('step %d: %r ---> %r  [rule: %r (%r)]' % (
                    step, result, tmp, ruletxt, rule
                ))
            result = tmp

        # Remove artifacts: strip the padding spaces and collapse the '-'
        # delimiters that were wrapped around each phoneme group.
        res, _count = re.subn(r'-+', '-', result.replace(' ', '').strip('-'))
        if explain:
            print("result: %s" % res)
            print("")
        return res
    

    With this I get the following results:

    >>> phonemes('abort', explain=True)
    word  : abort
    step 1: ' ABORT ' ---> ' -ae-BORT '  [rule: 'A///AE' ('(?P<found>A)')]
    step 2: ' -ae-BORT ' ---> ' -ae--b-ORT '  [rule: 'B///B' ('(?P<found>B)')]
    step 3: ' -ae--b-ORT ' ---> ' -ae--b--aw-r-T '  [rule: 'OR///AW-R' ('(?P<found>OR)')]
    step 4: ' -ae--b--aw-r-T ' ---> ' -ae--b--aw-r--t- '  [rule: 'T///T' ('(?P<found>T)')]
    result: ae-b-aw-r-t
    

    You'll need to order the rules sensibly to get the results you want, or use more complex algorithms that can find all possible rule-permutations that match and then find the best one.