python lda topic-modeling tweets

A practical example of GSDMM in python?


I want to use GSDMM to assign topics to the tweets in my data set. The only examples I have found (1 and 2) are not detailed enough. Do you know of a source (or would you be willing to write a small example) that shows how to use GSDMM in Python?


Solution

  • I finally put together working code for GSDMM and am posting it here in full so that others can use it. I have tried to comment the important parts:

    # Imports
    import random
    
    import numpy as np
    from gensim.models.phrases import Phraser, Phrases
    from gensim.utils import simple_preprocess
    from gsdmm import MovieGroupProcess
    
    
    # Raw corpus (placeholder -- supply your own). Each item must offer
    # .split(), i.e. it is expected to be a document string.
    data = ...
    
    # Stop-word collection (placeholder). It must support .extend(), which is
    # called at the end of this section.
    stop_words = ...
    
    # Whitespace-tokenize every document.
    data_words = [doc.split() for doc in data]
    
    # Vocabulary (placeholder). It is later handed to remove_stopwords() and
    # Phrases(), so presumably it is a list of tokenized documents -- confirm.
    vocabulary = ...
    
    # Add corpus-specific noise tokens to the stop words.
    stop_words.extend(['from', 'rt'])
    
    def remove_stopwords(texts):
        """Tokenize each document and drop stop words.

        Args:
            texts: Iterable of documents. Each one is converted with ``str()``
                and tokenized by gensim's ``simple_preprocess``.

        Returns:
            A list holding one token list per document, without any token
            that appears in the module-level ``stop_words``.
        """
        # Snapshot the stop words into a set once per call so every membership
        # test is O(1) instead of a linear scan of the list.
        blocked = frozenset(stop_words)
        return [
            [word for word in simple_preprocess(str(doc)) if word not in blocked]
            for doc in texts
        ]
    
    # Tokenize the documents and strip stop words.
    data_words_nostops = remove_stopwords(vocabulary)
    
    # Learn bigram collocations, then freeze them into a lightweight Phraser
    # that can be applied to token lists.
    bigram = Phrases(vocabulary, min_count=5, threshold=100)
    bigram_mod = Phraser(bigram)
    print('done!')
    
    # Merge the detected bigrams in every stop-word-free document.
    data_words_bigrams = [bigram_mod[doc] for doc in data_words_nostops]
    
    # Lemmatization: keep only the lemmas of content-bearing parts of speech.
    # NOTE(review): `nlp` is not defined anywhere in this snippet. The use of
    # token.lemma_ / token.pos_ suggests a loaded spaCy pipeline -- it must be
    # created before this loop runs.
    pos_to_use = ['NOUN', 'ADJ', 'VERB', 'ADV']
    data_lemmatized = []
    for sent in data_words_bigrams:
        doc = nlp(" ".join(sent))
        data_lemmatized.append(
            [token.lemma_ for token in doc if token.pos_ in pos_to_use]
        )
    
    # Final model input: one list of lemmas per document, plus corpus sizes.
    docs = data_lemmatized
    vocab = {x for doc in docs for x in doc}
    n_terms = len(vocab)
    n_docs = len(docs)
    
    # Train a new model.
    # Seed NumPy's global RNG as well as the stdlib one: gsdmm draws its
    # cluster assignments through numpy.random.multinomial, so seeding only
    # `random` does not make the run reproducible.
    random.seed(1000)
    np.random.seed(1000)
    
    # Init of the Gibbs Sampling Dirichlet Mixture Model algorithm
    mgp = MovieGroupProcess(K=10, alpha=0.1, beta=0.1, n_iters=30)
    
    # Fit the model; the second argument is the vocabulary size.
    y = mgp.fit(docs, n_terms)
    
    def top_words(cluster_word_distribution, top_cluster, values):
        """Print the most frequent words of each requested cluster.

        Args:
            cluster_word_distribution: Sequence (or mapping) indexed by cluster
                id, where each entry maps a word to its count in that cluster.
            top_cluster: Iterable of cluster ids to report, in print order.
            values: Maximum number of (word, count) pairs shown per cluster.
        """
        for cluster in top_cluster:
            # Read from the argument rather than a global model so the function
            # works with whatever distribution the caller passes in.
            sort_dicts = sorted(
                cluster_word_distribution[cluster].items(),
                key=lambda k: k[1],
                reverse=True,
            )[:values]
            print(f'Cluster {cluster} : {sort_dicts}')
            print(' — — — — — — — — — ')
    
    separator = '*' * 20
    
    # How many documents ended up in each cluster.
    doc_count = np.array(mgp.cluster_doc_count)
    print('Number of documents per topic :', doc_count)
    print(separator)
    
    # Cluster ids ordered from most to least populated (at most ten of them).
    top_index = np.argsort(doc_count)[-10:][::-1]
    print('Most important clusters (by number of docs inside):', top_index)
    print(separator)
    
    
    # Show the ten most frequent words of each of those clusters.
    top_words(mgp.cluster_word_distribution, top_index, 10)
    
    
    

    Links

    1. gensim modules
    2. Python library gsdmm