Hello Community Members,
At present, I am implementing the Word2Vec algorithm.
First, I extract the raw text, split it into tokens (words), remove the punctuation marks and store the tokens in a single flat list, so the list contains only words. Then I compute the frequency of each word, which gives me a frequency distribution.
Next, I try to build and train a Word2Vec model with gensim, but I am running into a problem: the word I query is reported as not being in the vocabulary. The code I have tried so far is below.
import nltk, re, gensim
import string
from collections import Counter
from string import punctuation
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from nltk.corpus import gutenberg, stopwords
def preprocessing():
    raw_data = gutenberg.raw('shakespeare-hamlet.txt')
    tokens = word_tokenize(raw_data)
    tokens = [w.lower() for w in tokens]
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    global words
    words = [word for word in stripped if word.isalpha()]
    sw = stopwords.words('english')
    sw1 = ['.', ',', '"', '?', '!', ':', ';', '(', ')', '[', ']', '{', '}']
    sw2 = ['for', 'on', 'ed', 'es', 'ing', 'of', 'd', 'is', 'has', 'have', 'been', 'had', 'was', 'are', 'were', 'a', 'an', 'the', 't', 's', 'than', 'that', 'it', '&', 'and', 'where', 'there', 'he', 'she', 'i', 'and', 'with', 'it', 'to', 'shall', 'why', 'ham']
    stop = sw + sw1 + sw2
    words = [w for w in words if w not in stop]

preprocessing()
def freq_count():
    fd = nltk.FreqDist(words)
    print(fd.most_common())

freq_count()
def word_embedding():
    for i in range(len(words)):
        model = Word2Vec(words, size=100, sg=1, window=3, min_count=1, iter=10, workers=4)
        model.init_sims(replace=True)
        model.save('word2vec_model')
        model = Word2Vec.load('word2vec_model')
        similarities = model.wv.most_similar('hamlet')
        for word, score in similarities:
            print(word, score)

word_embedding()
Note: I am using Python 3.7 on Windows. From the gensim documentation, it seems the model should be built and trained on sentences that have been split into tokens. My question is: how do I do the same with a corpus that is a single flat list containing only words? I have also tried wrapping the words in a list, i.e. [words], when training the model.
The first parameter passed to Word2Vec is expected to be a list of sentences, where each sentence is itself a list of tokens. You are passing a flat list of words, so gensim treats every word (a plain string) as a "sentence" and iterates over its characters; the resulting vocabulary consists of single characters, which is why 'hamlet' is not in it.
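As a toy illustration (not taken from your corpus), this is the shape gensim expects:

import gensim

# each inner list is one tokenised sentence
sentences = [['to', 'be', 'or', 'not', 'to', 'be'],
             ['the', 'rest', 'is', 'silence']]
toy_model = gensim.models.Word2Vec(sentences, min_count=1)

Passing words directly makes each individual word play the role of one of those inner lists, whereas passing [words] turns your whole corpus into a single long sentence, which is what you want here.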
import nltk
import re
import gensim
import string
from collections import Counter
from string import punctuation
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from nltk.corpus import gutenberg, stopwords
def preprocessing():
    raw_data = gutenberg.raw('shakespeare-hamlet.txt')
    tokens = word_tokenize(raw_data)
    tokens = [w.lower() for w in tokens]
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    global words
    words = [word for word in stripped if word.isalpha()]
    sw = stopwords.words('english')
    sw1 = ['.', ',', '"', '?', '!', ':', ';', '(', ')', '[', ']', '{', '}']
    sw2 = ['for', 'on', 'ed', 'es', 'ing', 'of', 'd', 'is', 'has', 'have', 'been', 'had', 'was', 'are', 'were', 'a', 'an', 'the', 't',
           's', 'than', 'that', 'it', '&', 'and', 'where', 'there', 'he', 'she', 'i', 'and', 'with', 'it', 'to', 'shall', 'why', 'ham']
    stop = sw + sw1 + sw2
    words = [w for w in words if w not in stop]

preprocessing()
def freq_count():
    fd = nltk.FreqDist(words)
    print(fd.most_common())

freq_count()
def word_embedding():
    # words is a flat list of tokens; wrap it in another list so that
    # gensim sees a corpus containing one (long) sentence
    print(type(words))
    model = Word2Vec([words], size=100, sg=1, window=3,
                     min_count=1, iter=10, workers=4)
    model.init_sims(replace=True)
    model.save('word2vec_model')
    model = Word2Vec.load('word2vec_model')
    similarities = model.wv.most_similar('hamlet')
    for word, score in similarities:
        print(word, score)

word_embedding()
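One caveat, based on the gensim changelog rather than on your post: the code above uses the gensim 3.x parameter names. If you are on gensim 4.0 or newer, size and iter have been renamed to vector_size and epochs, and init_sims() is deprecated, so the model line would look roughly like this:

model = Word2Vec([words], vector_size=100, sg=1, window=3,
                 min_count=1, epochs=10, workers=4)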
hope this helps :)