python-3.x, nltk, gensim, word2vec, nltk-book

'word' not in vocabulary when training gensim Word2Vec on a corpus that is a single list of words


Hello Community Members,

At present, I am implementing the Word2Vec algorithm.

First, I extracted the raw data (sentences), split the sentences into tokens (words), removed the punctuation marks, and stored the tokens in a single list, so the list contains only words. Then I counted how often each word occurs, which gives a list of (word, frequency) pairs.
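For example, here is a minimal sketch of that frequency step on toy tokens (not my actual corpus), using nltk's FreqDist:

import nltk

# FreqDist counts token occurrences; most_common() returns (word, count) pairs.
tokens = ['to', 'be', 'or', 'not', 'to', 'be']
fd = nltk.FreqDist(tokens)
print(fd.most_common())   # [('to', 2), ('be', 2), ('or', 1), ('not', 1)]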

Next, I build, save, and load the model with gensim. However, I am facing a problem: the word I query is reported as not in the vocabulary. The code I have tried is as follows.

import nltk, re, gensim
import string
from collections import Counter
from string import punctuation
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from nltk.corpus import gutenberg, stopwords

def preprocessing():
    raw_data = (gutenberg.raw('shakespeare-hamlet.txt'))
    tokens = word_tokenize(raw_data)
    tokens = [w.lower() for w in tokens]
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    global words
    words = [word for word in stripped if word.isalpha()]
    sw = (stopwords.words('english'))
    sw1= (['.', ',', '"', '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])
    sw2= (['for', 'on', 'ed', 'es', 'ing', 'of', 'd', 'is', 'has', 'have', 'been', 'had', 'was', 'are', 'were', 'a', 'an', 'the', 't', 's', 'than', 'that', 'it', '&', 'and', 'where', 'there', 'he', 'she', 'i', 'and', 'with', 'it', 'to', 'shall', 'why', 'ham'])
    stop=sw+sw1+sw2
    words = [w for w in words if not w in stop]
preprocessing()

def freq_count():
    fd = nltk.FreqDist(words)
    print(fd.most_common())
freq_count()
def word_embedding():
    for i in range(len(words)):
        model = Word2Vec(words, size = 100, sg = 1, window = 3, min_count = 1, iter = 10, workers = 4)
        model.init_sims(replace = True)
        model.save('word2vec_model')
        model = Word2Vec.load('word2vec_model')
        similarities = model.wv.most_similar('hamlet')
        for word, score in similarities:
            print(word , score)
word_embedding()

Note: I am using Python 3.7 on Windows. The gensim documentation suggests splitting sentences into tokens and passing those sentences to build and train the model. My question is how to apply the same approach to a corpus that is a single list containing only words. I have also tried wrapping the words in a list, i.e. [words], when training the model.


Solution

  • The first parameter to Word2Vec expects a list of sentences, where each sentence is itself a list of tokens. You're passing a flat list of words, so gensim iterates each string character by character and builds a vocabulary of single letters; that is why 'hamlet' is reported as not in the vocabulary. See the sketch below, followed by the corrected script.
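
    A minimal sketch of the shape difference, with toy sentences (not the actual corpus) and the same gensim 3.x API your code uses:

    from gensim.models import Word2Vec

    # Correct shape: an iterable of sentences, each sentence a list of tokens.
    sentences = [['hamlet', 'prince', 'denmark'], ['ghost', 'appears']]
    ok = Word2Vec(sentences, size=10, min_count=1)
    print('hamlet' in ok.wv.vocab)   # True

    # Flat list of strings: each word is iterated character by character,
    # so the vocabulary ends up holding single letters, not words.
    flat = ['hamlet', 'prince', 'denmark', 'ghost', 'appears']
    bad = Word2Vec(flat, size=10, min_count=1)
    print('hamlet' in bad.wv.vocab)  # False -- hence the 'not in vocabulary' error

    The corrected version of your full script: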

    import nltk
    import re
    import gensim
    import string
    from collections import Counter
    from string import punctuation
    from nltk.tokenize import word_tokenize
    from gensim.models import Word2Vec
    from nltk.corpus import gutenberg, stopwords
    
    
    def preprocessing():
        raw_data = (gutenberg.raw('shakespeare-hamlet.txt'))
        tokens = word_tokenize(raw_data)
        tokens = [w.lower() for w in tokens]
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        global words
        words = [word for word in stripped if word.isalpha()]
        sw = (stopwords.words('english'))
        sw1 = (['.', ',', '"', '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])
        sw2 = (['for', 'on', 'ed', 'es', 'ing', 'of', 'd', 'is', 'has', 'have', 'been', 'had', 'was', 'are', 'were', 'a', 'an', 'the', 't',
                's', 'than', 'that', 'it', '&', 'and', 'where', 'there', 'he', 'she', 'i', 'and', 'with', 'it', 'to', 'shall', 'why', 'ham'])
        stop = sw + sw1 + sw2
        words = [w for w in words if not w in stop]
    
    
    preprocessing()
    
    
    def freq_count():
        fd = nltk.FreqDist(words)
        print(fd.most_common())


    freq_count()
    
    
    def word_embedding():
        # No surrounding loop is needed: one training pass builds the model.
        # Wrap words in a list: Word2Vec expects an iterable of sentences,
        # each sentence being a list of tokens, so [words] is one sentence.
        model = Word2Vec([words], size=100, sg=1, window=3,
                         min_count=1, iter=10, workers=4)
        # Normalize vectors in place (gensim 3.x; deprecated in gensim 4).
        model.init_sims(replace=True)
        model.save('word2vec_model')
        model = Word2Vec.load('word2vec_model')
        similarities = model.wv.most_similar('hamlet')
        for word, score in similarities:
            print(word, score)
    
    
    word_embedding()
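
    One caveat with [words]: the whole corpus becomes a single giant sentence, so context windows can cross real sentence boundaries. As a hedged alternative sketch, you can keep sentence structure with nltk's sent_tokenize (variable names here are illustrative); note that in gensim 4+ the parameters size and iter were renamed vector_size and epochs, and init_sims is deprecated:

    from nltk.tokenize import sent_tokenize, word_tokenize
    from nltk.corpus import gutenberg
    from gensim.models import Word2Vec
    import string

    raw = gutenberg.raw('shakespeare-hamlet.txt')
    table = str.maketrans('', '', string.punctuation)

    # Build one token list per sentence, so Word2Vec windows stay inside sentences.
    sentence_corpus = []
    for sent in sent_tokenize(raw):
        tokens = [w.lower().translate(table) for w in word_tokenize(sent)]
        sentence_corpus.append([w for w in tokens if w.isalpha()])

    model = Word2Vec(sentence_corpus, size=100, sg=1, window=3,
                     min_count=1, iter=10, workers=4)
    print(model.wv.most_similar('hamlet'))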
    

    hope this helps :)