python jupyter-notebook gensim topic-modeling mallet

How to predict test data on Gensim Topic modelling

I have used Gensim LDAMallet for topic modelling but in what way we can predict sample paragraph and get their topic model using pretrained model.

# Build the bigram and trigram models
bigram = gensim.models.Phrases(t_preprocess(dataset.data), min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram) 

def make_bigrams(texts):
   return [bigram_mod[doc] for doc in texts]

data_words_bigrams = make_bigrams(t_preprocess(dataset.data))

# Create Dictionary
id2word = corpora.Dictionary(data_words_bigrams)

# Create Corpus
texts = data_words_bigrams

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

mallet_path='/home/riteshjain/anaconda3/mallet/mallet2.0.8/bin/mallet' 
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path,corpus=corpus, num_topics=12, id2word=id2word, random_seed = 0)

coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=texts, dictionary=id2word, coherence='c_v')

a = "When Honda builds a hybrid, you've got to be sure it‚Äôs a marvel. And an Accord Hybrid is when technology surpasses the known and takes a leap of faith into tomorrow. This is the next generation Accord, the ninth generation to be precise."

How to use this text (a) to get its topic from the pretrained model. Please help.

Solution

You're going to want to process 'a' similarly to the trained set:

# import a new data set to be passed through the pre-trained LDA

data_new = pd.read_csv('YourNew.csv', encoding = "ISO-8859-1");
data_new = data_new.dropna()
data_text_new = data_new[['Your Target Column']]
data_text_new['index'] = data_text_new.index

documents_new = data_text_new

# process the new data set through the lemmatization, and stopwork functions

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            nltk.bigrams(token)
            result.append(lemmatize_stemming(token))
    return result

processed_docs_new = documents_new['Your Target Column'].map(preprocess)

# create a dictionary of individual words and filter the dictionary
dictionary_new = gensim.corpora.Dictionary(processed_docs_new[:])
dictionary_new.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# define the bow_corpus
bow_corpus_new = [dictionary_new.doc2bow(doc) for doc in processed_docs_new]

Then you can just pass it through as a function:

a = ldamallet[bow_corpus_new[:len(bow_corpus_new)]]
b = data_text_new

topic_0=[]
topic_1=[]
topic_2=[]

for i in a:
    topic_0.append(i[0][1])
    topic_1.append(i[1][1])
    topic_2.append(i[2][1])
    
d = {'Your Target Column': b['Your Target Column'].tolist(),
     'topic_0': topic_0,
     'topic_1': topic_1,
     'topic_2': topic_2}
     
df = pd.DataFrame(data=d)
df.to_csv("YourAllocated.csv", index=True, mode = 'a')

I hope this helps :)