python nlp nltk pos-tagger

Position of that Noun and Verb


I have rule-based code that prints the noun (or run of up to three consecutive nouns) that is immediately followed by a verb in a sentence:

# For every news title, print the title and then the first run of one to
# three consecutive nouns that is immediately followed by a verb.
for text_id, text in enumerate(news_df['news_title'].values):

    # Remove commas, full stops and hyphens before tagging
    text = text.replace(',', '').replace('.', '').replace('-', '')
    sentence_tags = POSTAG(text.lower())

    print(text)

    # Each entry of sentence_tags is indexed as (word, tag) below.
    for index, part in enumerate(sentence_tags):
        try:
            # noun + verb
            if 'NN' in part[1] and 'VB' in sentence_tags[index + 1][1]:
                print(">", part[0])
                break

            # noun + noun + verb
            elif 'NN' in part[1] and 'NN' in sentence_tags[index + 1][1] and 'VB' in sentence_tags[index + 2][1]:
                print(">", part[0], sentence_tags[index + 1][0])
                break

            # noun + noun + noun + verb
            elif 'NN' in part[1] and 'NN' in sentence_tags[index + 1][1] and 'NN' in sentence_tags[index + 2][1] and 'VB' in sentence_tags[index + 3][1]:
                print(">", part[0], sentence_tags[index + 1][0], sentence_tags[index + 2][0])
                break

        except IndexError:
            # The lookahead ran past the end of the sentence; no match can
            # start at this position, so move on to the next one.
            pass
    print()

The output for sentences that match this rule:

high school football players charged after video surfaces showing hazing

> school football players

trump accuser pushes new york to pass the adult survivors act plans to sue

> trump accuser

Is there a way to also print the position of each noun that was printed by the rule (and of the verb that follows it)? For example:

>trump accuser , [0,5,"NN"] , [6,13,"VB"]

Solution

  • I changed the script and separated out the state-machine segment. The most serious limitation of this program, in my opinion, is that it returns only the first matching pattern in each sentence (this is quick to fix).

    import pandas as pd
    import nltk
    POSTAG = nltk.pos_tag
    df = pd.DataFrame({'text':['high school football players charged after video surfaces showing hazing', 'trump accuser pushes new york to pass the adult survivors act plans to sue']})
    # For each sentence, print the first run of consecutive nouns that is
    # immediately followed by a verb, together with each noun's
    # [start, end] character span and its POS tag.
    for text in df['text'].values:

        # Remove commas, full stops and hyphens before tokenizing
        text = text.replace(',', '').replace('.', '').replace('-', '')
        tokens = nltk.word_tokenize(text.lower())
        sentence_tags = POSTAG(tokens)

        # Character span of every token. The offsets are rebuilt from the
        # token lengths, so they assume the tokens are separated by exactly
        # one space in the cleaned, lower-cased text.
        start_end = []
        cursor = 0
        for word, _ in sentence_tags:
            start_end.append([cursor, cursor + len(word)])
            cursor += len(word) + 1

        # The state machine: collect consecutive nouns, stop at the first
        # verb that directly follows them, and start over on any other tag.
        noun_run = []  # (word, tag, span) for the current run of nouns
        verb_found = False
        for (word, tag), span in zip(sentence_tags, start_end):
            if tag.startswith('NN'):
                noun_run.append((word, tag, span))
            elif tag.startswith('VB') and noun_run:
                verb_found = True
                break
            else:
                # Anything else (including a verb with no noun before it)
                # breaks the run.
                noun_run = []

        # A run that reaches the end of the sentence without a verb after it
        # does not satisfy the rule, so it must not be reported.
        if not verb_found:
            noun_run = []

        print('> ',
              ' '.join(word for word, _, _ in noun_run),
              ' '.join('{} {}'.format(span, tag) for _, tag, span in noun_run))
          
    

    output:

    >  school football players [5, 11] NN [12, 20] NN [21, 28] NNS
    >  trump accuser [0, 5] NN [6, 13] NN