Tags: r, lda, text2vec

How to get topic probability table from text2vec LDA


The LDA topic modeling in the text2vec package is amazing. It is indeed much faster than the topicmodels package.

However, I don't know how to get the probability that each document belongs to each topic, as in the example below:

                 V1          V2          V3          V4
    1   0.001025237 7.89E-05    7.89E-05    7.89E-05
    2   0.002906977 0.002906977 0.014534884 0.002906977
    3   0.003164557 0.003164557 0.003164557 0.003164557
    4   7.21E-05    7.21E-05    0.000360334 7.21E-05
    5   0.000804433 8.94E-05    8.94E-05    8.94E-05
    6   5.63E-05    5.63E-05    5.63E-05    5.63E-05
    7   0.001984127 0.001984127 0.001984127 0.001984127
    8   0.003515625 0.000390625 0.000390625 0.000390625
    9   0.000748503 0.000748503 0.003742515 0.003742515
    10  0.000141723 0.00297619  0.000141723 0.000708617

This is my code for the text2vec LDA:

    library(text2vec)
    # mmseg4j(): Chinese word segmentation (not part of text2vec; e.g. from the rmmseg4j package)

    ss2 <- as.character(stressor5$weibo)
    seg2 <- mmseg4j(ss2)

    # Create vocabulary. Terms will be unigrams (simple words).
    it_test = itoken(seg2, progressbar = FALSE)
    vocab2 <- create_vocabulary(it_test)

    pruned_vocab2 = prune_vocabulary(vocab2,
                                     term_count_min = 10,
                                     doc_proportion_max = 0.5,
                                     doc_proportion_min = 0.001)

    vectorizer2 <- vocab_vectorizer(pruned_vocab2)

    dtm_test = create_dtm(it_test, vectorizer2)

    # use the pruned vocabulary so it matches the DTM built above
    lda_model = LDA$new(n_topics = 1000, vocabulary = pruned_vocab2,
                        doc_topic_prior = 0.1, topic_word_prior = 0.01)

    doc_topic_distr = lda_model$fit_transform(dtm_test, n_iter = 1000,
                                              convergence_tol = 0.01,
                                              check_convergence_every_n = 10)

Solution

  • doc_topic_distr is a matrix that contains the number of times words from each document were assigned to a particular topic. So you just need to normalize each row by the number of words in the document (you can also add doc_topic_prior before normalization).

    library(text2vec)
    data("movie_review")
    tokens = movie_review$review %>% 
      tolower %>% 
      word_tokenizer
    # turn off progressbar because it won't look nice in rmd
    it = itoken(tokens, ids = movie_review$id, progressbar = FALSE)
    v = create_vocabulary(it) %>% 
      prune_vocabulary(term_count_min = 10, doc_proportion_max = 0.2)
    vectorizer = vocab_vectorizer(v)
    dtm = create_dtm(it, vectorizer, type = "lda_c")
    doc_topic_prior = 0.1
    lda_model = 
      LDA$new(n_topics = 10, vocabulary = v, 
              doc_topic_prior = doc_topic_prior, topic_word_prior = 0.01)
    
    doc_topic_distr = 
      lda_model$fit_transform(dtm, n_iter = 1000, convergence_tol = 0.01, 
                              check_convergence_every_n = 10)
    head(doc_topic_distr)
    #       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
    #5814_8   16   18    0   34    0   16   49    0   20    23
    #2381_9    4    0    6   20    0    0    6    6    0    28
    #7759_3   21   39    7    0    3   47    0   25   21    17
    #3630_4   18    7   22   14   19    0   18    0    2    35
    #9495_8    4    0   13   17   13   78    3    2   28    25
    #8196_8    0    0    0   11    0    8    0    8    8     0
    doc_topic_prob = normalize(doc_topic_distr, norm = "l1")
    # or add the doc_topic_prior first and then normalize:
    # doc_topic_prob = normalize(doc_topic_distr + doc_topic_prior, norm = "l1")
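
  • Equivalently, you can do the normalization by hand (a minimal sketch, reusing the doc_topic_distr and doc_topic_prior objects fitted above): smooth the counts with the prior, then divide each row by its row sum.

    # manual equivalent of normalize(..., norm = "l1")
    smoothed = doc_topic_distr + doc_topic_prior
    doc_topic_prob_manual = sweep(smoothed, 1, rowSums(smoothed), "/")
    rowSums(doc_topic_prob_manual)  # every row now sums to 1
    head(doc_topic_prob_manual)     # per-document topic probabilities, as in the table in the question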