Position of that Noun and Verb

I have rule-based code that prints the noun (or run of up to three nouns) that is immediately followed by a verb in a sentence:

# For every news title, print the first run of one to three consecutive nouns
# (POS tag containing 'NN') that is immediately followed by a verb (POS tag
# containing 'VB'). `news_df` and `POSTAG` are expected to be defined earlier.
for text_id, text in enumerate(news_df['news_title'].values):

    # Remove the comma and full stops
    text = text.replace(',', '').replace('.', '').replace('-','')
    sentence_tags = POSTAG(text.lower())

    print(text)

    # Sentences parts
    for index, part in enumerate(sentence_tags):
        try:
            # One noun directly followed by a verb.
            if 'NN' in part[1] and 'VB' in sentence_tags[index + 1][1]:
                print(">", part[0])
                break

            # Two nouns followed by a verb.
            elif 'NN' in part[1] and 'NN' in sentence_tags[index + 1][1] and 'VB' in sentence_tags[index + 2][1]:
                print(">", part[0],  sentence_tags[index + 1][0])
                break

            # Three nouns followed by a verb.
            elif 'NN' in part[1] and 'NN' in sentence_tags[index + 1][1] and 'NN' in sentence_tags[index + 2][1] and 'VB' in sentence_tags[index + 3][1]:
                print(">", part[0],  sentence_tags[index + 1][0], sentence_tags[index + 2][0])
                break

        except IndexError:
            # The look-ahead ran past the end of the sentence; no match can
            # start at this position, so move on to the next one.
            pass
    print()

The output of a sentence following this rule:

high school football players charged after video surfaces showing hazing

> school football players

trump accuser pushes new york to pass the adult survivors act plans to sue

> trump accuser

Is there a way to also print the position of the noun that was printed by the rule (and of the verb that follows it)? For example:

>trump accuser , [0,5,"NN"] , [6,13,"VB"]

Solution

I changed the script and separated out the state-machine segment. The most serious problem with this program, IMO, is that it only returns the first matching pattern in each sentence (you can fix that quickly).

import pandas as pd
import nltk
POSTAG = nltk.pos_tag
def token_spans(text, tokens):
    """Return ``[start, end]`` character offsets of each token within *text*.

    Tokens are located left to right with ``str.find`` so that the offsets
    stay correct when tokens are separated by more than one space or when the
    tokenizer split a single word into several tokens.

    Args:
        text: The exact string that was tokenized.
        tokens: The tokens, in the order they occur in *text*.

    Returns:
        A list with one ``[start, end]`` pair (end exclusive) per token.
    """
    spans = []
    cursor = 0
    for token in tokens:
        start = text.find(token, cursor)
        if start == -1:
            # The token does not occur verbatim (the tokenizer rewrote it).
            # Estimate its position as the next non-space character, and do
            # not advance the cursor by the rewritten length so that the
            # following tokens can still be found.
            start = cursor
            while start < len(text) and text[start].isspace():
                start += 1
            spans.append([start, start + len(token)])
            cursor = start
        else:
            end = start + len(token)
            spans.append([start, end])
            cursor = end
    return spans


def first_noun_run_before_verb(words, tags, spans):
    """Find the first run of consecutive nouns directly followed by a verb.

    A noun is any token whose tag starts with ``'NN'``; a verb is any token
    whose tag starts with ``'VB'``. Any other tag discards the nouns collected
    so far.

    Args:
        words: Tokens of one sentence.
        tags: POS tag of each token, parallel to *words*.
        spans: ``[start, end]`` offsets of each token, parallel to *words*.

    Returns:
        A ``(words, tags, spans)`` tuple of lists describing the noun run, or
        three empty lists when no noun run is followed by a verb (including a
        noun run that merely ends the sentence).
    """
    run_words, run_tags, run_spans = [], [], []
    for word, tag, span in zip(words, tags, spans):
        if tag.startswith('NN'):
            run_words.append(word)
            run_tags.append(tag)
            run_spans.append(span)
        elif tag.startswith('VB') and run_words:
            return run_words, run_tags, run_spans
        else:
            # Something other than a noun interrupted the run: start over.
            run_words, run_tags, run_spans = [], [], []
    # The sentence ended without a verb closing the current run.
    return [], [], []


def main():
    """Tag the sample sentences and print the first noun run before a verb."""
    df = pd.DataFrame({'text':['high school football players charged after video surfaces showing hazing', 'trump accuser pushes new york to pass the adult survivors act plans to sue']})
    for text in df['text'].values:

        # Remove the comma and full stops
        text = text.replace(',', '').replace('.', '').replace('-','')
        # Offsets below are relative to this cleaned, lower-cased text.
        text = text.lower()
        tokens = nltk.word_tokenize(text)
        sentence_tags = POSTAG(tokens)
        words = [item[0] for item in sentence_tags]
        tags = [item[1] for item in sentence_tags]
        start_end = token_spans(text, words)

        words_to_print, tags_to_print, start_end_to_print = first_noun_run_before_verb(words, tags, start_end)

        print('> ', ' '.join(words_to_print), ' '.join([str(item[0])+' '+str(item[1]) for item in zip(start_end_to_print, tags_to_print)]))


if __name__ == "__main__":
    main()

output:

>  school football players [5, 11] NN [12, 20] NN [21, 28] NNS
>  trump accuser [0, 5] NN [6, 13] NN