python, nlp, nltk, part-of-speech, parse-tree

How do I retrieve phrases from an NLTK Tree using custom node labels?


Given an NLTK tree produced using the code below, how do I retrieve the leaf values (phrases) that match any of the node labels assigned by nltk.RegexpParser (e.g. those phrases which match the Present_Indefinite or Present_Perfect tense)?

from nltk import word_tokenize, pos_tag
import nltk

# Sample input: three sentences that mix hashtag/cashtag tokens with ordinary
# prose. The trailing backslashes continue the string literal across lines.
text = "#NOVAVAX has produced the #NUVAXOVID vaccine.\
 Will that provide a new rally? We see Biotechnology\
  Stock $NVAX Entering the Buying Area."
tokenized = word_tokenize(text) # Tokenize text
tagged = pos_tag(tokenized) # Tag tokenized text with PoS tags

# Chunk grammar handed to nltk.RegexpParser: one rule per line, written as
# "<label>: {<tag pattern>}", where the label names a verb tense and the
# pattern is a sequence of PoS tags. Rules with longer tag sequences are listed
# before shorter ones -- presumably so a longer tense pattern claims its tokens
# before a shorter rule can match part of it (confirm against NLTK's docs).
my_grammar = r"""
Future_Perfect_Continuous: {<MD><VB><VBN><VBG>}
Future_Continuous:         {<MD><VB><VBG>}
Future_Perfect:            {<MD><VB><VBN>}
Past_Perfect_Continuous:   {<VBD><VBN><VBG>}
Present_Perfect_Continuous:{<VBP|VBZ><VBN><VBG>}
Future_Indefinite:         {<MD><VB>}
Past_Continuous:           {<VBD><VBG>}
Past_Perfect:              {<VBD><VBN>}
Present_Continuous:        {<VBZ|VBP><VBG>}
Present_Perfect:           {<VBZ|VBP><VBN>}
Past_Indefinite:           {<VBD>}
Present_Indefinite:        {<VBZ>|<VBP>}"""


def check_grammar(grammar, tags):
    """Chunk PoS-tagged tokens with a regexp grammar.

    Args:
        grammar: Grammar string passed to ``nltk.RegexpParser``.
        tags: The tagged tokens to parse (here, the output of ``pos_tag``).

    Returns:
        Whatever ``RegexpParser.parse`` returns for ``tags``.
    """
    chunker = nltk.RegexpParser(grammar)
    return chunker.parse(tags)

# Chunk the tagged tokens with the tense grammar, then show the type of the
# object that comes back.
result = check_grammar(my_grammar, tagged)
print(type(result))
# Output: <class 'nltk.tree.tree.Tree'>

More specifically, given that the output of print(result) is as shown below, how can I retrieve the phrases labelled as Present_Perfect and Present_Indefinite, or more generally, any other phrases which match the labels in my grammar?

(S
  #/#
  NOVAVAX/NNP
  (Present_Perfect has/VBZ produced/VBN)
  the/DT
  #/#
  NUVAXOVID/NNP
  vaccine/NN
  ./.
  Will/MD
  that/WDT
  provide/VB
  a/DT
  new/JJ
  rally/NN
  ?/.
  We/PRP
  (Present_Indefinite see/VBP)
  Biotechnology/NNP
  Stock/NNP
  $/$
  NVAX/NNP
  Entering/NNP
  the/DT
  Buying/NNP
  Area/NNP
  ./.)

Solution

  • I've created a get_phrases_using_tense_labels() function which takes the parse tree and a list of the tense labels to look for.

    The tense labels are retrieved using the get_labels_from_grammar() function I created, which iterates over the lines in your grammar and splits the string at the ":" retrieving the tense label.

    The function then returns, for each node in the NLTK tree whose label matches any of your tense_labels (e.g. "Present_Indefinite" and "Present_Perfect" in the solution below), the first (word, tag) pair of the matched phrase. I've used a smaller text as input as an example.

    Parse tree with multiple tense labels

    Solution

    from nltk import word_tokenize, pos_tag
    import nltk
    
    # Longer sample carried over from the question. It is not used in the code
    # shown here: textSmall below is what actually gets tokenized.
    text = "#NOVAVAX produces #NUVAXOVID vaccine.\
     Will that provide a new rally? We see Biotechnology\
      Stock $NVAX Entering the Buying Area."
    
    # Smaller text for testing
    textSmall = "We see a surge in sales. It has been a great year."
    
    tokenized = word_tokenize(textSmall)  # Tokenize text
    tagged = pos_tag(tokenized)  # Tag tokenized text with PoS tags
    
    # Chunk grammar handed to nltk.RegexpParser: one rule per line, written as
    # "<label>: {<tag pattern>}", where the label names a verb tense. The same
    # string is later scanned by get_labels_from_grammar to list the labels.
    my_grammar = r"""
    Future_Perfect_Continuous: {<MD><VB><VBN><VBG>}
    Future_Continuous:         {<MD><VB><VBG>}
    Future_Perfect:            {<MD><VB><VBN>}
    Past_Perfect_Continuous:   {<VBD><VBN><VBG>}
    Present_Perfect_Continuous:{<VBP|VBZ><VBN><VBG>}
    Future_Indefinite:         {<MD><VB>}
    Past_Continuous:           {<VBD><VBG>}
    Past_Perfect:              {<VBD><VBN>}
    Present_Continuous:        {<VBZ|VBP><VBG>}
    Present_Perfect:           {<VBZ|VBP><VBN>}
    Past_Indefinite:           {<VBD>}
    Present_Indefinite:        {<VBZ>|<VBP>}"""
    
    
    def get_parse_tree(grammar, pos_tagged_text):
        """Chunk PoS-tagged text with a regexp grammar and return the result.

        Args:
            grammar: Grammar string passed to ``nltk.RegexpParser``.
            pos_tagged_text: The tagged tokens to parse (here, the output of
                ``pos_tag``).

        Returns:
            Whatever ``RegexpParser.parse`` returns for ``pos_tagged_text``.
        """
        # Tip: call .draw() on the returned tree to visualise it.
        return nltk.RegexpParser(grammar).parse(pos_tagged_text)
    
    
    def get_labels_from_grammar(grammar):
        """Return the rule labels declared in a line-separated regexp grammar.

        Each rule is expected on its own line as ``label: {pattern}``. The
        label is the text before the first ":" with surrounding whitespace
        removed, so an indented grammar yields clean labels.

        Lines that are blank, or that contain no ":" (for example a pattern
        continued on its own line), declare no label and are skipped. The
        first line is treated like any other, so a grammar that does not start
        with a newline keeps its first rule.

        Args:
            grammar: The grammar text, one rule per line.

        Returns:
            A list of label strings in the order they appear; duplicates are
            kept.
        """
        labels = []
        for line in grammar.splitlines():
            label, separator, _ = line.partition(":")
            label = label.strip()
            # No ":" or nothing before it means this line names no rule.
            if separator and label:
                labels.append(label)
        return labels
    
    
    def get_phrases_using_tense_labels(parse_tree, tense_labels_to_get):
        """Collect the leading leaf of every subtree carrying a wanted label.

        Args:
            parse_tree: Tree to search; its ``subtrees(filter=...)`` method is
                used to visit the matching nodes.
            tense_labels_to_get: Iterable of labels; a subtree matches when its
                ``label()`` equals any of them.

        Returns:
            A list with one entry per matching subtree, in traversal order.
            Each entry is only the FIRST leaf of that subtree
            (``leaves()[0]``), not the whole phrase.
        """
        def has_wanted_label(subtree):
            for wanted in tense_labels_to_get:
                if subtree.label() == wanted:
                    return True
            return False

        return [
            hit.leaves()[0]
            for hit in parse_tree.subtrees(filter=has_wanted_label)
        ]
    
    
    def get_tense_labels_in_tree(parse_tree, tense_labels_to_get):
        """List the wanted labels that actually occur in the parse tree.

        Args:
            parse_tree: Tree to search; its ``subtrees(filter=...)`` method is
                used to visit the matching nodes.
            tense_labels_to_get: Iterable of labels; a subtree matches when its
                ``label()`` equals any of them.

        Returns:
            The ``label()`` of every matching subtree, in traversal order. A
            label appears once per matching subtree, so repeats are possible.
        """
        def is_requested(subtree):
            for candidate in tense_labels_to_get:
                if subtree.label() == candidate:
                    return True
            return False

        found = []
        for match in parse_tree.subtrees(filter=is_requested):
            found.append(match.label())
        return found
    
    
    # Chunk the tagged sample text, then query the resulting tree using every
    # label declared in the grammar.
    text_parse_tree = get_parse_tree(my_grammar, tagged)
    # print(text_parse_tree)  # View parse tree output
    tense_labels = get_labels_from_grammar(my_grammar)
    phrases = get_phrases_using_tense_labels(text_parse_tree, tense_labels)
    labels = get_tense_labels_in_tree(text_parse_tree, tense_labels)
    
    # Each entry is the first (word, tag) leaf of a matched chunk.
    print(phrases)
    # Output: [('see', 'VBP'), ('has', 'VBZ')]
    # Keep just the word from each (word, tag) pair.
    print([phrase[0] for phrase in phrases])
    # Output: ['see', 'has']
    # Labels come out in the same traversal order as the phrases above, since
    # both helpers walk the tree with the same filter.
    print(labels)
    # Output: ['Present_Indefinite', 'Present_Perfect']