Just for the sake of practising, learning and experimenting, I made a word2vec model from scratch, using the formulas and algorithm of Jurafsky & Martin. Here is the code:
import random  # negative sampling
import numpy as np  # weight matrices and vector math
from IPython.display import clear_output  # used by print_progress (notebook utility)


class Word2Vec:
    def __init__(self, context_window=2, num_dimensions=5, num_negative_samples=2, normalizer_func=None):
        self.num_dimensions = num_dimensions
        self.context_window = context_window
        self.num_negative_samples = num_negative_samples * context_window
        # Fall back to an identity normalizer so compute_most_similar_words works without one.
        self.normalize = normalizer_func if normalizer_func is not None else (lambda word: word)
        self.W = None
        self.V = None

    def train(self, corpus, learning_rate=0.1, decay_rate=0.1, num_epochs=100):
        """
        (1) Build the vocabulary from the corpus.
        (2) Initialize the weight matrices.
        (3) Iterate over the number of epochs.
        (4) Update the weights for each training example (target word, list of context words, list of negative samples).
        (5) Add the two trained matrices.
        """
        self.losses = list()  # List to store the value of the loss function at each epoch (for informative purposes).
        self.count = 0
        self.last_print = ''
        self.num_epochs = num_epochs
        self.build_vocabulary(corpus)
        self.initialize_weights()
        for epoch in range(self.num_epochs):
            self.total_loss = 0.0  # For informative purposes.
            learning_rate = self.get_learning_rate(learning_rate=learning_rate)
            for target, context, negative_samples in self.generate_training_data(corpus):
                self.update_weights(target, context, negative_samples, learning_rate)
            self.losses.append(self.total_loss)
            print(f"Loss at epoch {epoch}: {self.total_loss}")
        self.W += self.V

    def build_vocabulary(self, corpus):
        """
        Create a list of unique words, where the index represents the word's idx.
        """
        self.vocabulary = set()
        for sentence in corpus:
            for word in sentence.split():
                self.vocabulary.add(word)
        self.vocabulary = list(self.vocabulary)

    def initialize_weights(self):
        """
        Create two weight matrices, W (target words) and V (context words), with dimensions vocab_size * ndims.
        The value of each dimension is randomly chosen within the range [-0.5/ndims, 0.5/ndims].
        """
        vocab_size = len(self.vocabulary)
        # Scale the random range by the number of dimensions, as described in the docstring.
        bound = 0.5 / self.num_dimensions
        self.W = np.random.uniform(low=-bound, high=bound, size=(vocab_size, self.num_dimensions))
        self.V = np.zeros((vocab_size, self.num_dimensions))

    def generate_training_data(self, corpus):
        """
        Convert each sentence in the corpus into an ordered list of words and yield the target word, context words, and negative samples.
        """
        for sentence in corpus:
            self.print_progress(len(corpus) * self.num_epochs)
            sentence_list = sentence.strip().split()
            for target_idx, target in enumerate(sentence_list):
                context = self.get_context(sentence_list, target_idx)
                negative_samples = self.get_negative_samples(context)
                yield target, context, negative_samples

    def get_context(self, sentence, target_idx):
        """
        Given a list of words (sentence) and the target word's index within that list, return a list of context words.
        """
        context = []
        for i in range(target_idx - self.context_window, target_idx + self.context_window + 1):
            if i != target_idx and i >= 0 and i < len(sentence):
                context.append(sentence[i])
        return context

    def get_negative_samples(self, context):
        """
        Select random words from the vocabulary that are not in the context words.
        """
        negative_samples = []
        while len(negative_samples) < self.num_negative_samples:
            sample = random.choice(self.vocabulary)
            if sample not in context:
                negative_samples.append(sample)
        return negative_samples

    def update_weights(self, target, context, negative_samples, learning_rate):
        """
        Iterate over the context words and negative samples.
        (1) Compute the error.
        (2) Add it to a total loss attribute (for later printing).
        (3) Update the weight matrices.
        """
        for context_word in context:
            error = self.compute_error(target, context_word, 1)  # 1 because it's the true label (context word)
            self.total_loss += abs(np.log(error + 1.01))
            self.update_weights_for_word(target, context_word, error, learning_rate)
        for negative_word in negative_samples:
            error = self.compute_error(target, negative_word, 0)  # 0 because it's the true label (negative sample)
            self.total_loss += abs(np.log(error + 1.01))
            self.update_weights_for_word(target, negative_word, error, learning_rate)

    def compute_error(self, target, context_word, label):
        """
        For context words, the true label is 1, and the algorithm aims to approximate the dot product (to make them similar according to cosine similarity).
        For negative words, the true label is 0, and the cost is higher if the prediction approaches 1.
        """
        def sigmoid(x):
            return 1 / (1 + np.exp(-x))

        target_vector = self.W[self.vocabulary.index(target)]
        context_vector = self.V[self.vocabulary.index(context_word)]
        dot_product = np.dot(target_vector, context_vector)
        prediction = sigmoid(dot_product)
        return prediction - label

    def update_weights_for_word(self, target, word, error, learning_rate):
        """
        Update the weights based on the Jurafsky and Martin equation.
        The gradient is equal to error * w/v.
        If the error is negative (context), the vectors are summed; otherwise (negative sample), they are subtracted.
        Note that in this algorithm, the update is done word by word, but it may be possible to optimize the calculations by summing the vectors of context words and negative words.
        """
        target_idx = self.vocabulary.index(target)
        word_idx = self.vocabulary.index(word)
        # Keep a copy of the pre-update target vector so both updates use the same (old) values.
        old_target_vector = self.W[target_idx].copy()
        self.W[target_idx] = self.W[target_idx] - learning_rate * error * self.V[word_idx]
        self.V[word_idx] = self.V[word_idx] - learning_rate * error * old_target_vector

    def print_progress(self, len_corpus):
        self.count += 1
        trained = round((self.count / len_corpus) * 100)
        if trained != self.last_print:
            clear_output(wait=True)
            print(f'{trained}% processed', flush=True)
            self.last_print = trained

    def get_word_vector(self, word):
        if word in self.vocabulary:
            return self.W[self.vocabulary.index(word)].reshape(1, -1)
        else:
            print(f"{word} is not in the vocabulary.")

    def compute_most_similar_words(self, word, top_k=5):
        # Normalize once, then use the normalized form consistently for lookup and filtering.
        word = self.normalize(word)
        word_vector = self.get_word_vector(word)
        if word_vector is None:
            return []
        similarities = np.dot(self.W, word_vector.T)
        top_indices = np.argsort(similarities, axis=0)[::-1][:top_k]
        similar_words = [self.vocabulary[idx] for idx in top_indices.flatten() if idx != self.vocabulary.index(word)]
        return similar_words

    def get_learning_rate(self, learning_rate, min_value=0.0001):
        if len(self.losses) < 2:
            return learning_rate
        if self.losses[-1] < self.losses[-2]:
            return learning_rate
        if self.losses[-1] > self.losses[-2]:
            if learning_rate > min_value:
                learning_rate = learning_rate * 0.5
                print(f'Adjusting learning rate. New value: {round(learning_rate, 2)}')
            return learning_rate
        return learning_rate

The problem is that I cannot get a loss value lower than 1 when training with a corpus of more than 20 sentences, unless I use a very low learning rate and hundreds of epochs and wait six or seven hours (which seems like far too much compared to fastText or similar off-the-shelf solutions).
I know there may be no easy solution, but there might be some bug or something I'm missing that could help me improve this code. Either way, I would appreciate your insights.
Thank you in advance
Some thoughts:
Until you're training with a corpus of many thousands of unique words, & with many varied usage contexts for every word-of-interest, & with vectors of higher dimensionality (at least 10s-of-dimensions), you won't be at the scale where word2vec provides value, and at the scale where variations of parameters/data/implementation-choices teach relevant/generalizable things about word2vec.
That is, 'toy-sized' tests will often show nothing interesting, or mislead about the relevant decisions & tradeoffs. As the only scale-of-data you've mentioned is a mere "20 sentences", and you speak favorably of your loss results at that scale, you may simply be worrying about things that don't matter for word2vec's actual effectiveness.
In particular, word2vec model overall quality is generally not evaluable by checking the training loss. That's only appropriate for determining the point at which more training can't help, when the improvement-in-loss stagnates, hinting at 'model convergence' – as good as it can get at predicting the training data, within its size/parameter constraints.
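For instance, here's a tiny sketch of that kind of stagnation check – the rel_tol and patience names & values are purely illustrative, not from any particular library:

def has_converged(losses, rel_tol=0.001, patience=3):
    # Treat training as 'converged' once the relative improvement in epoch loss
    # has stayed below rel_tol for `patience` consecutive epochs.
    if len(losses) <= patience:
        return False
    recent = losses[-(patience + 1):]
    gains = [(prev - curr) / abs(prev) for prev, curr in zip(recent, recent[1:]) if prev != 0]
    return bool(gains) and all(g < rel_tol for g in gains)

Something like this, run against your self.losses after each epoch, is only useful for deciding when extra epochs stop helping – not for judging the vectors themselves.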
(Further: typical open-source implementations like the original word2vec.c from Google, or the Python Gensim library, or Facebook's FastText release, don't even consult loss for that stopping decision, or for dynamic adjustment of the internal learning-rate. They instead traditionally just let users specify a fixed number of epochs, with straight-line decay of the learning-rate. If training-loss is reported at all, it's only as advisory info for the operator, to guide comparisons to other runs.)
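As a rough illustration of that fixed-epochs-plus-linear-decay convention (a sketch of the general idea only – those implementations typically decay the rate continuously over the whole run rather than once per epoch):

def linear_decay_lr(epoch, num_epochs, initial_lr=0.025, min_lr=0.0001):
    # Straight-line decay from initial_lr toward min_lr over a fixed epoch budget,
    # regardless of what the reported loss is doing.
    return max(min_lr, initial_lr * (1.0 - epoch / num_epochs))

With num_epochs=5 (a common default), that gives 0.025, 0.020, 0.015, 0.010, 0.005 – no feedback from loss at all.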
Whether a model's word-vectors – the neural-net's 'input weights' – are good at tasks is a separate matter from its internal training loss, and a model with higher final-epoch training-loss might yield better vectors for real purposes than a model with lower final-epoch training-loss.
In particular, a model that's far oversized for the training data may become incredibly good at predicting the idiosyncrasies of the small training set – very low training loss – in ways that are deleterious for other out-of-training language modeling: overfitting.
So any concern that "this run's loss doesn't seem low enough" is likely misguided. Ask instead: when loss stagnated, were the word-vectors useful on robust repeatable evaluations, testing for the qualities your task(s) require?
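For example, here's one crude but repeatable probe along those lines, using your own get_word_vector – the word pairs are placeholders you'd swap for probes that matter to your task, and all of them need to be in the vocabulary:

import numpy as np

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-9))

def related_beats_unrelated(model, related_pairs, unrelated_pairs):
    # True if hand-picked related pairs score higher, on average, than
    # hand-picked unrelated pairs -- a tiny, repeatable vector-quality check.
    rel = [cosine(model.get_word_vector(a)[0], model.get_word_vector(b)[0]) for a, b in related_pairs]
    unrel = [cosine(model.get_word_vector(a)[0], model.get_word_vector(b)[0]) for a, b in unrelated_pairs]
    return float(np.mean(rel)) > float(np.mean(unrel))

Running such checks at different stopping points tells you far more than the absolute loss value does.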
From a quick glance at your code, some other comments:
Your exploratory implementation has none of the common optimizations of usual implementations – most especially use of optimized bulk vector operations. Thus this implementation will be far slower, and not an accurate hint as to what algorithms might be competitive for real uses. For example, when Gensim moved from a pure-Python implementation to one which used Cython code to use a BLAS library, its word2vec code ran 80X-120X faster. (Your "6 hours" thus might be "3 minutes" once code is similarly optimized.)
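To give a concrete flavor of what 'bulk vector operations' means here: the per-word updates in your update_weights_for_word could in principle be collapsed into one NumPy step per training example – roughly like the sketch below, which is written against your class's attributes (update_example_batched is just an illustrative name) and assumes no word repeats within a single example:

import numpy as np

def update_example_batched(model, target, words, labels, learning_rate):
    # words: the context words followed by the negative samples;
    # labels: matching 1s (context) and 0s (negatives).
    t_idx = model.vocabulary.index(target)
    w_idx = [model.vocabulary.index(w) for w in words]
    target_vec = model.W[t_idx].copy()  # shape (d,)
    ctx_vecs = model.V[w_idx]           # shape (n, d)
    errors = 1.0 / (1.0 + np.exp(-(ctx_vecs @ target_vec))) - np.asarray(labels, dtype=float)  # (n,)
    # One gradient step for every context/negative row, and one accumulated
    # step for the single target row, all using pre-update values.
    model.V[w_idx] -= learning_rate * errors[:, None] * target_vec
    model.W[t_idx] -= learning_rate * (errors @ ctx_vecs)

Even bigger wins come from precomputing word-to-index lookups (your repeated list.index calls are each a linear scan of the vocabulary) and from compiled/BLAS-backed inner loops, as Gensim does via Cython.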
Words which appear only once, or a few times, in a natural-language corpus can be very numerous, by usual Zipfian frequency distributions, but nearly irrelevant to automated understanding of the texts. Yet assigning them all trainable-vectors, & having the model learn weak representations from their idiosyncratic occurrences, consumes lots of memory & training time for no benefit. So typical implementations, unlike yours, ignore all words under some chosen minimum-frequency threshold. Then training time, model size – and the measurable quality of remaining words' vectors for usual tasks! – are all much improved.
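A minimal version of that filtering, if you wanted to bolt it onto your build_vocabulary (min_count=5 is the long-standing default in word2vec.c and Gensim; the function name here is just illustrative):

from collections import Counter

def build_filtered_vocabulary(corpus, min_count=5):
    # Count every token, then keep only words that appear at least min_count times;
    # rarer words are simply dropped from the vocabulary (and thus from training).
    counts = Counter(word for sentence in corpus for word in sentence.split())
    return [word for word, count in counts.items() if count >= min_count]

You'd also want generate_training_data / get_context to skip the dropped words rather than treat them as in-vocabulary tokens.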
I see you're starting with a learning-rate, 0.1, that's far higher than the usual defaults chosen by other implementations, often 0.025 or 0.05. You're also using a loss-driven learning-rate adjustment (halving the rate whenever an epoch's loss rises), where open-source implementations often use simple linear decay. Your decisions might be improvements! But they aren't typical, and should ultimately be evaluated on whether they reach convergence faster, and whether the final word-vectors work better or worse than those from other methods – not on any intuitions about what absolute training-loss values should be.
Hope this helps!