Given an NLTK tree produced using the code below, how do I retrieve the leaf values (phrases) that match any of the node labels assigned by nltk.RegexpParser
(e.g. those phrases which match the Present_Indefinite or Present_Perfect tense)?
from nltk import word_tokenize, pos_tag
import nltk
# Adjacent string literals are joined by the parser. The explicit trailing
# spaces keep the sentences apart, so the tokenizer sees "vaccine. Will"
# and "Biotechnology Stock" instead of the fused "vaccine.Will" /
# "BiotechnologyStock" that a backslash continuation inside the string gives.
text = (
    "#NOVAVAX has produced the #NUVAXOVID vaccine. "
    "Will that provide a new rally? We see Biotechnology "
    "Stock $NVAX Entering the Buying Area."
)
tokenized = word_tokenize(text)  # Tokenize text
tagged = pos_tag(tokenized)  # Tag tokenized text with PoS tags

# One chunking stage per tense, written as "Label: {tag pattern}".
# Multi-verb patterns are listed before the shorter patterns they contain;
# RegexpParser runs the stages in the order given, so e.g. <VBZ><VBN> is
# claimed by Present_Perfect before Present_Indefinite can take the <VBZ>.
my_grammar = r"""
Future_Perfect_Continuous: {<MD><VB><VBN><VBG>}
Future_Continuous: {<MD><VB><VBG>}
Future_Perfect: {<MD><VB><VBN>}
Past_Perfect_Continuous: {<VBD><VBN><VBG>}
Present_Perfect_Continuous:{<VBP|VBZ><VBN><VBG>}
Future_Indefinite: {<MD><VB>}
Past_Continuous: {<VBD><VBG>}
Past_Perfect: {<VBD><VBN>}
Present_Continuous: {<VBZ|VBP><VBG>}
Present_Perfect: {<VBZ|VBP><VBN>}
Past_Indefinite: {<VBD>}
Present_Indefinite: {<VBZ>|<VBP>}"""
def check_grammar(grammar, tags):
    """Chunk PoS-tagged tokens with a regular-expression chunk grammar.

    Args:
        grammar: Chunk grammar string in the format accepted by
            ``nltk.RegexpParser``.
        tags: Sequence of ``(word, tag)`` pairs, e.g. the output of
            ``pos_tag``.

    Returns:
        The parse tree produced by ``nltk.RegexpParser(grammar).parse(tags)``.
    """
    cp = nltk.RegexpParser(grammar)
    return cp.parse(tags)
# Chunk the tagged tokens with the tense grammar to obtain the parse tree
result = check_grammar(grammar=my_grammar, tags=tagged)
print(type(result))
# Output: <class 'nltk.tree.tree.Tree'>
More specifically, given that the output of print(result) is as shown below, how can I retrieve the phrases labelled as Present_Perfect and Present_Indefinite,
or more generally, any other phrases which match the labels in my grammar?
(S
#/#
NOVAVAX/NNP
(Present_Perfect has/VBZ produced/VBN)
the/DT
#/#
NUVAXOVID/NNP
vaccine/NN
./.
Will/MD
that/WDT
provide/VB
a/DT
new/JJ
rally/NN
?/.
We/PRP
(Present_Indefinite see/VBP)
Biotechnology/NNP
Stock/NNP
$/$
NVAX/NNP
Entering/NNP
the/DT
Buying/NNP
Area/NNP
./.)
I've created a get_phrases_using_tense_labels() function which takes:
- the parse tree returned by your check_grammar() function (I've renamed it to get_parse_tree(), as this is more meaningful in terms of what the function is doing), and
- a list of tense labels.
The tense labels are retrieved using the get_labels_from_grammar() function I created, which iterates over the lines in your grammar and splits each line at the ":" to retrieve the tense label.
The function then returns, for each node in the NLTK tree whose label matches any of your tense_labels (e.g. "Present_Indefinite" and "Present_Perfect" in the solution below), the leading (word, tag) leaf of the matched phrase.
I've used a smaller text as input as an example.
from nltk import word_tokenize, pos_tag
import nltk
# Adjacent string literals are joined by the parser. The explicit trailing
# spaces keep the sentences apart, so the text reads "vaccine. Will" and
# "Biotechnology Stock" instead of being fused by a backslash continuation.
text = (
    "#NOVAVAX produces #NUVAXOVID vaccine. "
    "Will that provide a new rally? We see Biotechnology "
    "Stock $NVAX Entering the Buying Area."
)
# Smaller text for testing
textSmall = "We see a surge in sales. It has been a great year."
tokenized = word_tokenize(textSmall)  # Tokenize text
tagged = pos_tag(tokenized)  # Tag tokenized text with PoS tags

# One chunking stage per tense, written as "Label: {tag pattern}".
# Multi-verb patterns are listed before the shorter patterns they contain;
# RegexpParser runs the stages in the order given, so e.g. <VBZ><VBN> is
# claimed by Present_Perfect before Present_Indefinite can take the <VBZ>.
my_grammar = r"""
Future_Perfect_Continuous: {<MD><VB><VBN><VBG>}
Future_Continuous: {<MD><VB><VBG>}
Future_Perfect: {<MD><VB><VBN>}
Past_Perfect_Continuous: {<VBD><VBN><VBG>}
Present_Perfect_Continuous:{<VBP|VBZ><VBN><VBG>}
Future_Indefinite: {<MD><VB>}
Past_Continuous: {<VBD><VBG>}
Past_Perfect: {<VBD><VBN>}
Present_Continuous: {<VBZ|VBP><VBG>}
Present_Perfect: {<VBZ|VBP><VBN>}
Past_Indefinite: {<VBD>}
Present_Indefinite: {<VBZ>|<VBP>}"""
def get_parse_tree(grammar, pos_tagged_text):
    """Build a chunk parse tree from PoS-tagged text.

    Args:
        grammar: Chunk grammar string in the format accepted by
            ``nltk.RegexpParser``.
        pos_tagged_text: Sequence of ``(word, tag)`` pairs, e.g. the output
            of ``pos_tag``.

    Returns:
        The parse tree produced by
        ``nltk.RegexpParser(grammar).parse(pos_tagged_text)``.
    """
    cp = nltk.RegexpParser(grammar)
    parse_tree = cp.parse(pos_tagged_text)
    # parse_tree.draw()  # Visualise parse tree
    return parse_tree
def get_labels_from_grammar(grammar):
    """Return the rule labels of a line-separated NLTK regexp grammar.

    Each rule line has the form ``Label: {tag pattern}``; the label is the
    text before the first ":". Blank lines, ``#`` comment lines and lines
    without a ":" are ignored, and surrounding whitespace is stripped, so
    the grammar may or may not start with a newline.

    Args:
        grammar: Grammar string with one rule per line.

    Returns:
        List of label strings, in the order they appear in the grammar.
    """
    labels = []
    for line in grammar.splitlines():
        rule = line.strip()
        # Only lines of the form "Label: ..." introduce a label.
        if not rule or rule.startswith("#") or ":" not in rule:
            continue
        label = rule.partition(":")[0].strip()
        if label:
            labels.append(label)
    return labels
def get_phrases_using_tense_labels(parse_tree, tense_labels_to_get, *, whole_phrase=False):
    """Collect the leaves of subtrees whose label is one of the given tenses.

    Args:
        parse_tree: Tree exposing ``subtrees(filter=...)``, with subtrees
            exposing ``label()`` and ``leaves()`` (e.g. an ``nltk.Tree``).
        tense_labels_to_get: Iterable of grammar labels to look for.
        whole_phrase: When False (default), return only the first leaf of
            each matching subtree. When True, return every leaf of each
            matching subtree, i.e. the complete phrase.

    Returns:
        With ``whole_phrase=False``: a list with one leaf per matching
        subtree. With ``whole_phrase=True``: a list of leaf lists, one per
        matching subtree. Order follows ``parse_tree.subtrees``.
    """
    # Build the lookup once instead of rescanning the label list per subtree.
    wanted = set(tense_labels_to_get)
    matches = parse_tree.subtrees(filter=lambda subtree: subtree.label() in wanted)
    if whole_phrase:
        return [subtree.leaves() for subtree in matches]
    return [subtree.leaves()[0] for subtree in matches]
def get_tense_labels_in_tree(parse_tree, tense_labels_to_get):
    """Return the tense labels that actually occur in the parse tree.

    Args:
        parse_tree: Tree exposing ``subtrees(filter=...)``, with subtrees
            exposing ``label()`` (e.g. an ``nltk.Tree``).
        tense_labels_to_get: Iterable of grammar labels to look for.

    Returns:
        List with the label of every matching subtree, in the order
        ``parse_tree.subtrees`` yields them. A label appears once per
        matching subtree, so it may repeat.
    """
    # Build the lookup once instead of rescanning the label list per subtree.
    wanted = set(tense_labels_to_get)
    matches = parse_tree.subtrees(filter=lambda subtree: subtree.label() in wanted)
    return [subtree.label() for subtree in matches]
# Chunk the tagged text, then query the tree for every label in the grammar
text_parse_tree = get_parse_tree(grammar=my_grammar, pos_tagged_text=tagged)
# print(text_parse_tree)  # Uncomment to view the parse tree output
tense_labels = get_labels_from_grammar(grammar=my_grammar)
phrases = get_phrases_using_tense_labels(
    parse_tree=text_parse_tree,
    tense_labels_to_get=tense_labels,
)
labels = get_tense_labels_in_tree(
    parse_tree=text_parse_tree,
    tense_labels_to_get=tense_labels,
)
print(phrases)
# Output: [('see', 'VBP'), ('has', 'VBZ')]
print([word for word, _tag in phrases])
# Output: ['see', 'has']
print(labels)
# Output: ['Present_Indefinite', 'Present_Perfect']