I have a set of rules that can be used to convert text to a set of phonemes. The application of these rules would result in conversions such as the following:
a uh
ability ae-b-ih-l-ih-t-ee
aboard uh-b-oh-r-d
abort uh-b-oh-r-t
affirmative ah-f-eh-r-m-ah-t-ih-v
all aw-l
alter ah-l-t-r
an ae-n
and ae-n-d
Andy ae-n-d-ee
any eh-n-ee
anybody ae-n-ee-b-ah-d-ee
at ae-t
attacked uh-t-ae-k-t
I want to create a function that can be applied to text and return the phonemes corresponding to this text using the conversion rules.
A rule consists of a few parts. The first part is the text token under consideration. The second part is the text that must be found before that token. The third part is the text that must be found after it. The fourth part is the phoneme that should result from the conversion. Rules are written with the parts separated by slashes:
text found/text before text found/text after text found/phoneme
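For illustration, a single rule can be pulled apart on the slashes (using the rule "AR/ /O/UH-R" from the list below):

text, before, after, phoneme = "AR/ /O/UH-R".split('/')
# text = 'AR', before = ' ' (nothing before, i.e. the start of a word),
# after = 'O', phoneme = 'UH-R'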
Given rules of this form, what would be a good way to apply them to strings of text? I want to try to build a function that can parse text to find a rule match.
The special symbols used in the rules are as follows, followed by the full rule set:
# one or more vowels (AEIOUY)
+ one of E, I, Y (a front vowel)
: zero or more consonants (BCDFGHJKLMNPQRSTVWXZ)
^ one consonant
. one of B, V, D, G, J, L, M, N, R, W, Z (a voiced consonant)
% one of ER, E, ES, ED, ING, ELY (a suffix)
& one of S, C, G, Z, X, J, CH, SH (a sibilant)
@ one of T, S, R, D, L, Z, N, J, TH, CH, SH (a consonant influencing a following u)
" /// "
"A// /UH"
"ARE/ / /AH-R"
"AR/ /O/UH-R"
"AR//#/EH-R"
"AS/ ^/#/AE-A-S"
"A//WA/UH"
"AW///AW"
"ANY/ ://EH-N-EE"
"A//^+#/AE-A"
"ALLY/#://UH-L-EE"
"AL/ /#/UH-L"
"AGAIN///UH-G-EH-N"
"AG/#:/E/IH-J"
"A//^+:#/AE"
"A/ :/^+/AE-A"
"ARR/ //UH-R"
"ARR///AE-R"
"AR/ ://AH-R"
"AR// /AE-R"
"AR///AH-R"
"AIR///EH-R"
"AI///AE-A"
"AY///AE-A"
"AU///AW"
"AL/#:/ /UH-L"
"ALS/#:/ /UH-L-Z"
"ALK///AW-K"
"AL//^/AW-L"
"ABLE/ ://AE-A-B-UH-L"
"ABLE///UH-B-UH-L"
"ANG//+/AE-A-N-J"
"ATHE/ C/ /AE-TH-EE"
"A//A/AH"
"A///AE"
"BE/ /^#/B-IH"
"BEING///B-EE-IH-N"
"BOTH/ / /B-OH-TH"
"BUS/ /#/B-IH-Z"
"BUIL///B-IH-L"
"B/ / /B-EE"
"B///B"
"CH/ /^/K"
"CH/^E//K"
"CH///CH"
"CI/ S/#/S-AH-EE"
"CI//A/SH"
"CI//O/SH"
"CI//EN/SH"
"C//+/S"
"CK///K"
"COM//%/K-AH-M"
"C/ / /S-EE"
"C///K"
"DED/#:/ /D-IH-D"
"D/.E/ /D"
"D/#^:E/ /T"
"DE/ /^#/D-IH"
"DO/ / /D-OO"
"DOES/ //D-UH-Z"
"DOING/ //D-OO-IH-N"
"DOW/ //D-OH"
"DU//A/J-OO"
"D/ / /D-EE"
"DOUGH///D-OH"
"D///D"
"E/#:/ /"
"E/'^:/ /"
"E/ :/ /EE"
"ED/#/ /D"
"E/#:/D /"
"ER//EV/EH-V"
"EVEN/ EL//EH-V-EH-N"
"EVEN/ S//EH-V-EH-N"
"E//^%/EE"
"E//PH%/EE"
"ERI//#/EE-R-EE"
"ER/#:/#/AE-R"
"ER//#/EH-R"
"ER///AE-R"
"EVEN/ //EE-V-EH-N"
"E/#:/W/"
"EW/@//OO"
"EW///Y-OO"
"E//O/EE"
"ES/#:&/ /IH-Z"
"E/#:/S /"
"ELY/#://L-EE"
"EMENT/#://M-EH-N-T"
"EFUL///F-U-L"
"EE///EE"
"EARN///AE-R-N"
"EAR/ /^/AE-R"
"EAD///EH-D"
"EA/#:/ /EE-UH"
"EA//SU/EH"
"EA///EE"
"EIGH///AE-A"
"EI///EE"
"EYE/ //AH-EE"
"EY///EE"
"EU///Y-OO"
"E/ / /EE"
"E/^/ /"
"E///EH"
"FUL///F-U-L"
"F/F//"
"F/ / /EH-F"
"F///F"
"GIV///G-IH-V"
"G/ /I^/G"
"GE//T/G-EH"
"GGES/SU//G-J-EH-SS"
"G/G//"
"G/ B#//G"
"G//+/J"
"GREAT///G-R-AE-A-T"
"GH/#//"
"G/ / /G-EE"
"G///G"
"HAV/ //H-AE-V"
"HERE/ //H-EE-R"
"HOUR/ //OH-AE-R"
"HOW///H-OH"
"H//#/H"
"H/ / /H-AE-CH"
"H///"
"IN/ //IH-N"
"I/ / /AH-EE"
"IN//D/IH-N"
"IER///EE-AE-R"
"IED/#:R//EE-D"
"IED// /AH-EE-D"
"IEN///EE-EH-N"
"IE//T/AH-EE-EH"
"I/ :/%/AH-EE"
"I//%/EE"
"IE///EE"
"INE/N//AH-EE-N"
"IME/T//AH-EE-M"
"I//^+:#/IH"
"IR//#/AH-EE-R"
"IS//%/AH-EE-S"
"IX//%/IH-K-S"
"IZ//%/AH-EE-Z"
"I//D%/AH-EE"
"I/+^/^+/IH"
"I//T%/AH-EE"
"I/#^:/^+/IH"
"I//^+/AH-EE"
"IR///AE-R"
"IGH///AH-EE"
"ILD///AH-EE-L-D"
"IGN// /AH-EE-N"
"IGN//^/AH-EE-N"
"IGN//%/AH-EE-N"
"IQUE///EE-K"
"I///IH"
"J/ / /J-A-EE"
"J///J"
"K//N/"
"K/ / /K-A-EE"
"K///K"
"LO//C#/L-OH"
"L/L//"
"L/#^:/%/UH-L"
"LEAD///L-EE-D"
"L/ / /AE-L"
"L///L"
"MOV///M-OO-V"
"M/ / /EH-M"
"M///M"
"NG/E/+/N-J"
"NG//R/N"
"NG//#/N"
"NGL//%/N-UH-L"
"NG///N"
"NK///N-K"
"NOW/ / /N-OH"
"N/ / /EH-N"
"N/N//"
"N///N"
"OF// /UH-V"
"OROUGH///AE-R-OH"
"OR/ F/TY/OH-R"
"OR/#:/ /AE-R"
"ORS/#:/ /AE-R-Z"
"OR///AW-R"
"ONE/ //W-UH-N"
"OW//EL/OH"
"OW///OH"
"OVER/ //OH-V-AE-R"
"OV///UH-V"
"O//^%/OH"
"O//^EN/OH"
"O//^I#/OH"
"OL//D/OH-L"
"OUGHT///AH-T"
"OUGH///UH-F"
"OU/ /^L/UH"
"OU/ //OH"
"OU/H/S#/OH"
"OUS///UH-S"
"OUR/ F//OH-R"
"OUR///AW-R"
"OUD///U-D"
"OUP///OO-P"
"OU///OH"
"OY///AW-EE"
"OING///OH-IH-N"
"OI///AW-EE"
"OOR///OH-R"
"OOK///U-K"
"OOD///U-D"
"OO///OO"
"O//E/OH"
"O// /OH"
"OA// /OH"
"ONLY/ //OH-N-L-EE"
"ONCE/ //W-UH-N-S"
"ON'T// /OH-N-T"
"O/C/N/AH"
"O//NG/AH"
"O/^:/N/UH"
"ON/I//UH-N"
"ON/#:/ /UH-N"
"ON/#^//UH-N"
"O//ST /OH"
"OF//^/AW-F"
"OTHER///UH-TH-AE-R"
"OSS// /AW-S"
"OM/#^:/ /UH-M"
"O///AH"
"PH///F"
"PEOP///P-EE-P"
"POW///P-OH"
"PUT// /P-U-T"
"P/ / /P-EE"
"P/P//"
"P///P"
"QUAR///K-W-AW-R"
"QU/ //K-W"
"QU///K"
"Q/ / /K-OO"
"Q///K"
"RE/ /^#/R-EE"
"R/ / /AH"
"R/R//"
"R///R"
"SH///SH"
"SION/#//ZH-UH-N"
"SOME///S-AH-M"
"SUR/#/#/ZH-AE-R"
"SUR//#/SH-AE-R"
"SU/#/#/ZH-OO"
"SSU/#/#/SH-OO"
"SED/#/ /Z-D"
"S/#/#/Z"
"SAID///S-EH-D"
"SION/^//SH-UH-N"
"S/S//"
"S/./ /Z"
"S/#:.E/ /Z"
"S/#^:##/ /Z"
"S/#^:#/ /S"
"S/U/ /S"
"S/ :#/ /Z"
"SCH/ //S-K"
"S//C+/"
"SM/#//Z-M"
"SN/#/ /Z-UH-N"
"S/ / /EH-S"
"S///S"
"THE/ / /TH-UH"
"TO// /T-OO"
"THAT///TH-AE-T"
"THIS/ / /TH-IH-S"
"THEY/ //TH-AE-A"
"THERE/ //TH-EH-R"
"THER///TH-AE-R"
"THEIR///TH-EH-EH"
"THAN/ / /TH-AE-N"
"THEM/ / /TH-EH-M"
"THESE// /TH-EE-Z"
"THEN/ //TH-EH-N"
"THROUGH///TH-R-OO"
"THOSE///TH-OH-Z"
"THOUGH// /TH-OH"
"THUS/ //TH-UH-S"
"TH///TH"
"TED/#:/ /T-IH-D"
"TI/S/#N/CH"
"TI//O/SH"
"TI//A/T"
"TIEN///SH-UH-N"
"TUR//#/CH-AE-R"
"TU//A/CH-OO"
"TWO/ //T-OO"
"T/ / /T-EE"
"T/T//"
"T///T"
"UN/ /I/Y-OO-N"
"UN/ //UH-N"
"UPON/ //UH-P-AW-N"
"UR/@/#/AE-R"
"UR//#/Y-AE-R"
"UR///AE-R"
"U//^ /UH"
"U//^^/UH"
"UY///AH-EE"
"U/ G/#/"
"U/G/%/"
"U/G/#/W"
"U/#N//Y-OO"
"UI/@//OO"
"U/@//UH"
"U///Y-OO"
"VIEW///V-Y-OO"
"V/ / /V-EE"
"V///V"
"WHERE/ //W-AE-R"
"WA//S/W-AH"
"WA//T/W-AH"
"WHERE///WH-EH-R"
"WHAT///WH-AH-T"
"WHOL///H-OH-L"
"WHO///H-OO"
"WH///WH"
"WAR///W-AH-R"
"WOR///W-AE-R"
"WR///R"
"W/ / /D-AH-B-L-Y-OO"
"W///W"
"X//^/EH-K-S"
"X/ / /EH-K-S"
"X/ /#/Z-EH"
"X///K-S"
"YOUNG///Y-UH-N"
"YOU/ //Y-OO"
"YES/ //Y-EH-S"
"Y/ / /WH-UH-Y"
"Y/ //Y"
"Y/#^:/ /EE"
"Y/#^:/I/EE"
"Y/ :/ /AH-EE"
"Y/ :/#/AH-EE"
"Y/ :/^+:#/IH"
"Y/ :/^#/AH-EE"
"Y///IH"
"ZZ///T-Z"
"Z/ / /Z-EH-D"
"Z///Z"
It turns out that Python's lookbehind requires a fixed-width pattern, which doesn't fit your rules, so we have to be a little more elaborate.
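Just to illustrate the problem (a quick standalone check, not part of the solution), compiling a rule-like pattern with a variable-width lookbehind fails:

import re

try:
    re.compile(r'(?<=[BCD]*)A')   # lookbehind for "zero or more of B, C, D"
except re.error as exc:
    print(exc)                    # something like: look-behind requires fixed-width pattern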
First let's define a translation between your syntax and regex:
import re

rule_syntax = {
    '#': r'[AEIOUY]+',
    '+': r'[EIY]',
    ':': r'[BCDFGHJKLMNPQRSTVWXZ]*',
    '^': r'[BCDFGHJKLMNPQRSTVWXZ]',
    '.': r'[BVDGJLMNRWZ]',
    '%': r'(?:ER|E|ES|ED|ING|ELY)',
    '&': r'(?:[SCGZXJ]|CH|SH)',
    '@': r'(?:[TSRDLZNJ]|TH|CH|SH)',
}
and a function to create a regex fragment from this mapping:
def mkregex(rule):
    # translate each special symbol to its regex fragment;
    # ordinary characters (letters, spaces, apostrophes) pass through unchanged
    regex = r""
    for ch in rule:
        regex += rule_syntax.get(ch, ch)
    return regex
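For example, the after-context '^+:#' from the rule "A//^+:#/AE" expands to:

>>> mkregex('^+:#')
'[BCDFGHJKLMNPQRSTVWXZ][EIY][BCDFGHJKLMNPQRSTVWXZ]*[AEIOUY]+'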
I'm not sure how you want to handle rules with spaces, so I've commented out the " /// " rule to get the results below.
Now we implement a function that converts your rule syntax into an "interesting" tuple:
def mkrule(ruletxt):
    txt, before, after, phoneme = ruletxt.split('/')
    rule = r""
    if before:
        # use a non-capturing group to match the 'before' text
        rule += r'(?:' + mkregex(before) + ')'
    # create a capturing group for the text in question
    rule += r'(?P<found>' + txt + ')'
    if after:
        # add a lookahead pattern
        rule += r'(?=' + mkregex(after) + ')'
    # return a tuple containing
    # - the regex created from the rule
    # - a lower-cased version of the phonemes between dashes
    # - the original rule (for explaining and debugging)
    return rule, "-%s-" % phoneme.lower(), ruletxt
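For example, the rule "AR/ /O/UH-R" becomes:

>>> mkrule("AR/ /O/UH-R")
('(?: )(?P<found>AR)(?=O)', '-uh-r-', 'AR/ /O/UH-R')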
The approach we will take is to iteratively replace matched rules with phonemes. To make sure we don't re-convert text that has already been converted (i.e. phonemes), we make the input string upper case and the phonemes lower case. To keep the phonemes from running into each other, we add a - on each side (we'll have to clean this up at the end).
Convert all your rules to interesting tuples:
rules = [mkrule(r) for r in [
    #" /// ",  # this rule creates problems
    "A// /UH",
    "ARE/ / /AH-R",
    "AR/ /O/UH-R",
    "AR//#/EH-R",
    "AS/ ^/#/AE-A-S",
    "A//WA/UH",
    "AW///AW",
    ...
]]
We're almost there; we just need a function to replace the text found by a single rule:
def match_and_replace(word, rule, phonemes):
    # a rule can match multiple times, find all of them;
    # replace only the 'found' group, not the 'before' context that precedes it
    matches = [(m.start('found'), m.end('found')) for m in re.finditer(rule, word)]
    matches.reverse()        # we're going to replace in-place, so start from behind
    chars = list(word)       # convert to list of chars since strings are immutable
    for start, end in matches:
        chars[start:end] = phonemes
    return ''.join(chars)    # convert back to string
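Applying a single rule by hand looks like this (using the 'OR///AW-R' pattern on a padded word):

>>> match_and_replace(' ABORT ', '(?P<found>OR)', '-aw-r-')
' AB-aw-r-T '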
Finally, the function to extract 'phonemes' from a word:
def phonemes(word, explain=False):
    # rule engines should always be able to explain their results ;-)
    if explain:
        print("word :", word)
    # add a space around the word to give the rules containing spaces something to work with
    result = " %s " % word.upper()
    step = 0
    # iterate over all the interesting tuples
    for rule, phoneme, ruletxt in rules:
        # for each rule, tmp is the string where all matches for `rule`
        # have been replaced by `phoneme`
        tmp = match_and_replace(result, rule, phoneme)
        if explain and tmp != result:
            step += 1
            print('step %d: %r ---> %r [rule: %r (%r)]' % (
                step, result, tmp, ruletxt, rule
            ))
        result = tmp
    # remove artifacts: drop the padding spaces, strip leading/trailing dashes,
    # and collapse runs of dashes
    res, _count = re.subn(r'-+', '-', result.replace(' ', '').strip('-'))
    if explain:
        print("result:", res)
        print()
    return res
With this I get the following results:
>>> phonemes('abort', explain=True)
word : abort
step 1: ' ABORT ' ---> ' -ae-BORT ' [rule: 'A///AE' ('(?P<found>A)')]
step 2: ' -ae-BORT ' ---> ' -ae--b-ORT ' [rule: 'B///B' ('(?P<found>B)')]
step 3: ' -ae--b-ORT ' ---> ' -ae--b--aw-r-T ' [rule: 'OR///AW-R' ('(?P<found>OR)')]
step 4: ' -ae--b--aw-r-T ' ---> ' -ae--b--aw-r--t- ' [rule: 'T///T' ('(?P<found>T)')]
result: ae-b-aw-r-t
You'll need to order the rules sensibly to get the results you want, or use more complex algorithms that can find all possible rule-permutations that match and then find the best one.