python pandas nlp nltk n-gram

How to efficiently build ngrams based on categories in a dataframe


Problem

I have a dataframe that consists of text which belongs to a category. I now want to get the most commonly used n-grams (bigrams in the example) per category. I managed to do this, but the code for this is way too long in my opinion.

Sample Code

import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

# Sample data: one text per row, each labelled with a category.
data  = {'text':['sport sport text sample sport sport text sample', 'math math text sample math math text sample', 
'politics politics text sample politics politics text sample'],
'category' : ["sport", "math", "politics"]}
df = pd.DataFrame(data)

# Get text per category.
# Each variable is a one-element list holding the text of the FIRST row
# (iloc[0]) whose category matches; any further rows would be ignored.
sport = [df[df['category'] == 'sport'].reset_index()['text'].iloc[0]]
math = [df[df['category'] == 'math'].reset_index()['text'].iloc[0]]
politics = [df[df['category'] == 'politics'].reset_index()['text'].iloc[0]]

# Calculate ngrams per category.
# The same three steps are repeated for every category:
#   1. tokenize each text and collect its n-grams (tuples of n tokens),
#   2. count the n-grams and keep the 10 most frequent ([:10]),
#   3. tag the resulting frame with a category label.
# n is the n-gram size (2 = bigrams).
n = 2

# NOTE(review): the labels assigned to 'category' below ('Business & Finance',
# 'Computers & Internet', 'Education & Reference') differ from the category
# values in the sample data ('sport', 'math', 'politics') -- this looks like a
# leftover from another dataset; confirm which labels are intended.
sport_ngrams = []
for sample in sport:
  sport_ngrams.extend(ngrams(nltk.word_tokenize(sample), n))
sport_ngrams_df = pd.DataFrame(pd.Series(sport_ngrams).value_counts()[:10]).reset_index()
sport_ngrams_df['category'] = 'Business & Finance'

math_ngrams = []
for sample in math:
  math_ngrams.extend(ngrams(nltk.word_tokenize(sample), n))
math_ngrams_df = pd.DataFrame(pd.Series(math_ngrams).value_counts()[:10]).reset_index()
math_ngrams_df['category'] = 'Computers & Internet'

politics_ngrams = []
for sample in politics:
  politics_ngrams.extend(ngrams(nltk.word_tokenize(sample), n))
politics_ngrams_df = pd.DataFrame(pd.Series(politics_ngrams).value_counts()[:10]).reset_index()
politics_ngrams_df['category'] = 'Education & Reference'

# Concatenate df's.
# "index" is the column created by reset_index() (it holds the n-gram tuples);
# the 0 key presumably targets the unnamed counts column that some pandas
# versions produce -- TODO confirm against the pandas version in use.
bigram_df = pd.concat([sport_ngrams_df, math_ngrams_df, politics_ngrams_df
                       ]).rename(columns={"index": "word", 0:'count'})

# Bare expression: presumably shown as the cell result in a notebook/REPL.
bigram_df

Output

word count category
('sport', 'sport') 2 Business & Finance
('sport', 'text') 2 Business & Finance
('text', 'sample') 2 Business & Finance
('sample', 'sport') 1 Business & Finance
('math', 'math') 2 Computers & Internet
('math', 'text') 2 Computers & Internet
('text', 'sample') 2 Computers & Internet
('sample', 'math') 1 Computers & Internet
('politics', 'politics') 2 Education & Reference
('politics', 'text') 2 Education & Reference
('text', 'sample') 2 Education & Reference
('sample', 'politics') 1 Education & Reference

Question

Is there a more efficient way to build the n-grams where I don't have to get the text and create the n-grams per category separately?

Thank you in advance for the help!


Solution

  • Sure, the process for each category is identical so you can put it in a loop:

    import pandas as pd
    import nltk
    from nltk.tokenize import word_tokenize
    from nltk.util import ngrams
    
    # Sample data: one free-text document per row, labelled with its category.
    data = {
        'text': [
            'sport sport text sample sport sport text sample',
            'math math text sample math math text sample',
            'politics politics text sample politics politics text sample',
        ],
        'category': ["sport", "math", "politics"],
    }
    df = pd.DataFrame(data)

    n = 2       # n-gram size (2 = bigrams)
    top_k = 10  # number of most frequent n-grams to keep per category

    # Collect one small frame per category and concatenate once at the end;
    # growing a DataFrame with pd.concat inside the loop re-copies all
    # previously collected rows on every iteration.
    frames = []

    # groupby yields each category exactly once together with ALL of its
    # texts, so categories spanning several rows are neither processed
    # repeatedly nor truncated to their first row. sort=False keeps the
    # categories in order of first appearance.
    for categ, texts in df.groupby('category', sort=False)['text']:
        categ_ngrams = []
        for sample in texts:
            categ_ngrams.extend(ngrams(nltk.word_tokenize(sample), n))

        # Texts shorter than n tokens produce no n-grams; skip the category
        # instead of appending an empty frame.
        if not categ_ngrams:
            continue

        # Count only after every text of the category has been processed.
        ngrams_df = pd.DataFrame(
            pd.Series(categ_ngrams).value_counts()[:top_k]
        ).reset_index()
        ngrams_df['category'] = categ
        frames.append(ngrams_df)

    if frames:
        # Same column names as the expected output in the question. The
        # counts column is called 0 or 'count' depending on the pandas
        # version; rename() ignores keys that are not present.
        bigram_df = pd.concat(frames).rename(columns={"index": "word", 0: 'count'})
    else:
        # pd.concat raises on an empty list, so build the empty result directly.
        bigram_df = pd.DataFrame(columns=['word', 'count', 'category'])

    bigram_df