I have a data frame that has a column containing some text.
I want to extract phrases from the text with the format NN + VB + NN
or NN + NN + VB + NN
or NN + ... + NN + VB + NN
et cetera. Basically, I want to get the simple phrases with 1 to n nouns before the first encountered verb, followed by a noun.
I'm using nltk.pos_tag after tokenizing the texts to get the tag of each word; however, I cannot find a way to get what I want.
I also thought about bigrams, trigrams, n-grams, etc., but couldn't find a way to apply them.
Any help, please?
Here is a solution which utilises nltk.RegexpParser with a custom grammar rule to match occurrences of one or more nouns, followed by a verb, followed by a noun, specifically:
{<N.*>+<V.*><N.*>}
which is equivalent to,
{<NN|NNS|NNP|NNPS>+<VB|VBP|VBZ|VBG|VBD|VBN><NN|NNS|NNP|NNPS>}
Parsing "Prodikos Socrates recommended Plato, and Plato recommended Aristotle" produces the following labelled parse tree:
Output:
['Prodikos', 'Socrates', 'recommended', 'Plato']
['Plato', 'recommended', 'Aristotle']
Note: The above rule does not handle symbols and punctuation interrupting the first sequence of nouns (e.g. "Prodikos, Socrates recommended Plato" will only match "Socrates recommended Plato"). There is likely a way to handle this case using some regexp pattern and the NLTK PoS tags, but it is not immediately obvious to me.
from nltk import word_tokenize, pos_tag, RegexpParser
# Sample sentence used to exercise the pipeline; it contains two
# noun(s) + verb + noun sequences separated by ", and".
text = "Prodikos Socrates recommended Plato, and Plato recommended Aristotle"
tokenized = word_tokenize(text) # Split the sentence into word/punctuation tokens
tagged = pos_tag(tokenized) # Pair each token with its PoS tag -> list of (word, tag) tuples
print(tagged)
# Output: [('Prodikos', 'NNP'), ('Socrates', 'NNP'), ('recommended', 'VBD'), ('Plato', 'NNP'), (',', ','),
# ('and', 'CC'), ('Plato', 'NNP'), ('recommended', 'VBD'), ('Aristotle', 'NNP')]
# Custom chunk grammar with a single rule labelled NOUNS_VERB_NOUN:
#   <N.*>+  one or more tokens whose tag starts with "N" (nouns)
#   <V.*>   exactly one token whose tag starts with "V" (verb)
#   <N.*>   exactly one token whose tag starts with "N" (noun)
my_grammar = r"""
NOUNS_VERB_NOUN: {<N.*>+<V.*><N.*>}"""
# Function to create parse tree using custom grammar rules and PoS tagged text
def get_parse_tree(grammar, pos_tagged_text, *, draw=True):
    """Chunk PoS-tagged tokens with a custom NLTK regexp grammar.

    Args:
        grammar: Chunk grammar string handed to ``RegexpParser``.
        pos_tagged_text: Tagged tokens handed to ``RegexpParser.parse``; the
            script passes the ``(word, tag)`` list produced by ``pos_tag``.
        draw: When true (the default, which keeps the original behaviour),
            call ``draw()`` on the resulting tree to visualise it. Pass
            ``draw=False`` to only parse, e.g. when processing many texts
            in a loop.

    Returns:
        The parse tree returned by ``RegexpParser.parse``.
    """
    parse_tree = RegexpParser(grammar).parse(pos_tagged_text)
    if draw:
        parse_tree.draw()  # Visualise parse tree
    return parse_tree
# Function to get labels from custom grammar:
# takes line separated NLTK regexp grammar rules
def get_labels_from_grammar(grammar):
    """Return the chunk labels declared in an NLTK regexp grammar string.

    A label is the text before the first ``:`` on a rule line. Blank lines,
    ``#`` comment lines and label-less lines (no ``:``) are skipped, and
    surrounding whitespace is stripped, so the grammar may start on the very
    first line of the string and may be indented.

    Args:
        grammar: Line-separated NLTK regexp grammar rules.

    Returns:
        List of label strings in order of appearance (duplicates are kept).
    """
    labels = []
    for raw_line in grammar.splitlines():
        line = raw_line.strip()
        # Nothing to extract from empty or comment-only lines.
        if not line or line.startswith("#"):
            continue
        label, separator, _ = line.partition(":")
        label = label.strip()
        # Only lines of the form "LABEL: pattern" declare a label.
        if separator and label:
            labels.append(label)
    return labels
# Function takes parse tree & list of NLTK custom grammar labels as input
# Returns phrases which match
def get_phrases_using_custom_labels(parse_tree, custom_labels_to_get):
    """Collect the words of every subtree whose label is one of the given labels.

    Args:
        parse_tree: Tree exposing ``subtrees(filter=...)``; each subtree must
            provide ``label()`` and ``leaves()``. Leaves are indexed with
            ``[0]`` to get the word (the script supplies ``(word, tag)``
            tuples).
        custom_labels_to_get: Iterable of labels to match.

    Returns:
        One list of words per matching subtree, in the order the tree
        yields its subtrees.
    """
    # Materialise once: the filter runs for every subtree, so a one-shot
    # iterable (e.g. a generator) would be exhausted after the first node.
    wanted_labels = tuple(custom_labels_to_get)
    matching_subtrees = parse_tree.subtrees(
        filter=lambda subtree: subtree.label() in wanted_labels
    )
    # Keep the words only, drop the PoS tags.
    return [[leaf[0] for leaf in subtree.leaves()] for subtree in matching_subtrees]
# Run the pipeline: chunk the tagged tokens with the custom grammar, read the
# rule labels out of the grammar string, then collect the words of every chunk
# that carries one of those labels.
text_parse_tree = get_parse_tree(my_grammar, tagged)
my_labels = get_labels_from_grammar(my_grammar)
phrases = get_phrases_using_custom_labels(text_parse_tree, my_labels)
# Print each matched phrase (a list of words) on its own line.
for phrase in phrases:
    print(phrase)
# Output:
# ['Prodikos', 'Socrates', 'recommended', 'Plato']
# ['Plato', 'recommended', 'Aristotle']