I'm trying to replicate Google's research paper on WSD (word sense disambiguation) with neural models using PyTorch.
I'm having some issues trying to overfit the model on a tiny example before training on larger datasets.
Using this training set:
The film was also intended to be the first in a trilogy.
this model definition:
import torch
import torch.autograd as autograd
import torch.nn as nn

class WordGuesser(nn.Module):
    def __init__(self, hidden_dim, context_dim, embedding_dim, vocabulary_dim, batch_dim, window_dim):
        super(WordGuesser, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_dim = batch_dim
        self.window_dim = window_dim
        self.word_embeddings = nn.Embedding(vocabulary_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        #self.extract_context = nn.Linear((2 * window_dim + 1) * hidden_dim, context_dim)
        self.extract_context = nn.Linear(hidden_dim, context_dim)
        self.predict = nn.Linear(context_dim, vocabulary_dim)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # (h_0, c_0), each shaped (num_layers, batch, hidden_dim)
        return (autograd.Variable(torch.zeros(1, self.batch_dim, self.hidden_dim).cuda()),
                autograd.Variable(torch.zeros(1, self.batch_dim, self.hidden_dim).cuda()))

    def forward(self, sentence, hidden):
        embeddings = self.word_embeddings(sentence)
        # nn.LSTM expects (seq_len, batch, embedding_dim), so swap the first two axes
        out, self.hidden = self.lstm(embeddings.permute(1, 0, 2), hidden)
        lstm_out = out[-1]  # hidden state after the last time step, (batch, hidden_dim)
        context = self.extract_context(lstm_out)
        prediction = self.predict(context)
        return prediction, context
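For reference, a quick shape sanity check of the forward pass; the dimensions below are made-up placeholders, not values from the paper:

# Hypothetical dimensions, chosen only to trace tensor shapes
vocab_dim, emb_dim, hid_dim, ctx_dim, batch, seq = 14, 256, 512, 256, 13, 13
m = WordGuesser(hid_dim, ctx_dim, emb_dim, vocab_dim, batch, seq).cuda()
dummy = autograd.Variable(torch.zeros(batch, seq).long().cuda())  # (batch, seq_len) of word indices
scores, ctx = m(dummy, m.init_hidden())
# scores: (batch, vocabulary_dim) -- one raw score per vocabulary word
# ctx:    (batch, context_dim)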
and this training routine:
import numpy as np
import torch.optim as optim

num_epoch = 100
hidden_units = 512
embedding_dim = 256
context_dim = 256

def mytrain():
    lines = open('training/overfit.txt').readlines()
    sentences = data.split_to_sentences(lines)  # uses spaCy to detect sentences in each line

    # The dictionary is built from the training set
    word2idx = dict()
    idx2word = dict()
    i = 0
    for s in sentences:
        for t in s.split(' '):
            if t in word2idx:
                continue
            word2idx[t] = i
            idx2word[i] = t
            i += 1
    word2idx['$'] = i  # the token marking the missing word to guess in a sentence
    idx2word[i] = '$'

    # One training pair per token position: mask the token with '$', use it as the label
    X = list()
    Y = list()
    for sentence in sentences:
        sentence = sentence.split(' ')
        for i in range(len(sentence)):
            newsentence = list(sentence)
            newsentence[i] = '$'
            if not sentence[i] in word2idx:
                continue
            indices = [word2idx[w] for w in newsentence]
            label = word2idx[sentence[i]]
            X.append(indices)
            Y.append(label)

    model = WordGuesser(hidden_units, context_dim, embedding_dim, len(word2idx), len(X), len(X[0]))
    model.train()
    model.cuda()

    input = torch.LongTensor(X).cuda()
    output = torch.LongTensor(Y).cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    model.hidden = model.init_hidden()

    for epoch in range(num_epoch):
        model.hidden = model.init_hidden()
        model.zero_grad()
        input_tensor = autograd.Variable(input)
        target_tensor = autograd.Variable(output)
        predictions, context = model(input_tensor, model.hidden)
        # Print the top-5 predictions for each masked sentence next to its label
        for i, prediction in enumerate(predictions):
            sorted_val = sorted(enumerate(np.array(prediction.data)), key=lambda x: x[1], reverse=True)
            print([(idx2word[x[0]], x[1]) for x in sorted_val[:5]], idx2word[Y[i]])
        loss = criterion(predictions, target_tensor)
        loss.backward()
        optimizer.step()
        print(epoch, loss.data[0])
    torch.save(model, "train2.pt")  # pickles the whole model object
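To make the data construction concrete: for the single overfit sentence, the loop produces one (masked sentence, label) pair per token position. A quick way to inspect them, runnable inside mytrain() once X and Y are built:

# Print each masked sentence next to the word the model must guess
for masked, label in zip(X, Y):
    print(' '.join(idx2word[idx] for idx in masked), '->', idx2word[label])
# e.g. "$ film was also intended to be the first in a trilogy ." -> "The"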
During training, the model seems able to overfit just after the 21st epoch, as you can see from the following scores (the top 5 predicted words per masked sentence, with the last word on each line being the label for that sentence):
[('The', 11.362326), ('film', 11.356865), ('also', 7.5573149), ('to', 5.3518314), ('intended', 4.3520432)] The
[('film', 11.073805), ('The', 10.451499), ('also', 7.5498624), ('was', 4.9684553), ('be', 4.0730805)] film
[('was', 11.232123), ('also', 9.9741745), ('the', 6.0156212), ('be', 4.9949703), ('The', 4.5516477)] was
[('also', 9.6998224), ('was', 9.6202812), ('The', 6.345758), ('film', 4.9122157), ('be', 2.6727715)] also
[('intended', 18.344809), ('to', 16.410078), ('film', 10.147289), ('The', 9.8423424), ('$', 9.6181822)] intended
[('to', 12.442947), ('intended', 10.900065), ('film', 8.2598763), ('The', 8.0493736), ('$', 4.4901967)] to
[('be', 12.189278), ('also', 7.7172523), ('was', 7.5415096), ('the', 5.2521734), ('The', 4.1723843)] be
[('the', 15.59604), ('be', 9.3750105), ('first', 8.9820032), ('was', 8.6859236), ('also', 5.0665498)] the
[('first', 10.191225), ('the', 5.1829329), ('in', 3.6020348), ('be', 3.4108081), ('a', 1.5569853)] first
[('in', 14.731103), ('first', 9.3131113), ('a', 5.982264), ('trilogy', 4.2928643), ('be', 0.49548936)] in
[('a', 14.357709), ('in', 8.3088198), ('trilogy', 6.3918238), ('first', 6.2178354), ('intended', 0.95656234)] a
[('trilogy', 14.351434), ('a', 4.5073452), ('in', 4.2348137), ('$', 3.7552347), ('intended', 3.5101018)] trilogy
[('.', 18.152126), ('$', 12.028764), ('to', 9.6003456), ('intended', 8.1202478), ('The', 4.9225812)] .
When I run another Python script that loads the model and queries it with the following sentences (using the same code as in training to print the scores):
The film was also intended to $ the first in a trilogy. be
The film $ also intended to be the first in a trilogy. was
$ film was also intended to be the first in a trilogy. The
I'm getting these scores:
[('film', 24.066889), ('$', 20.107487), ('was', 16.855488), ('a', 12.969441), ('in', 8.1248817)] be
[('film', 24.089062), ('$', 20.116539), ('was', 16.891994), ('a', 12.982826), ('in', 8.1167336)] was
[('film', 23.993624), ('$', 20.108011), ('was', 16.891005), ('a', 12.960193), ('in', 8.1577587)] The
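For reference, a minimal sketch of the querying side (this assumes word2idx and idx2word are rebuilt exactly as in training so the indices line up):

queries = [
    "The film was also intended to $ the first in a trilogy .",
    "The film $ also intended to be the first in a trilogy .",
    "$ film was also intended to be the first in a trilogy .",
]
model = torch.load("train2.pt")  # loads the whole pickled model
X = [[word2idx[w] for w in q.split(' ')] for q in queries]
input_tensor = autograd.Variable(torch.LongTensor(X).cuda())
hidden = (autograd.Variable(torch.zeros(1, len(X), model.hidden_dim).cuda()),
          autograd.Variable(torch.zeros(1, len(X), model.hidden_dim).cuda()))
predictions, context = model(input_tensor, hidden)
for i, prediction in enumerate(predictions):
    sorted_val = sorted(enumerate(np.array(prediction.data)), key=lambda x: x[1], reverse=True)
    print([(idx2word[x[0]], x[1]) for x in sorted_val[:5]])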
I've also tried turning off training mode with model.eval() instead of model.train(), as well as calling topk() on the LSTM scores, but the results aren't satisfying.
I solved it by saving only the model's state_dict() via torch.save() and then loading it back in the evaluation phase using model.load_state_dict().
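A minimal sketch of that pattern (the model has to be re-instantiated with the same constructor arguments before the weights are restored):

# End of training: persist only the learned parameters
torch.save(model.state_dict(), "train2.pt")

# Evaluation script: rebuild the model, then restore the weights
# (constructor arguments must be the same values used in training)
model = WordGuesser(hidden_units, context_dim, embedding_dim, vocabulary_dim, batch_dim, window_dim)
model.load_state_dict(torch.load("train2.pt"))
model.cuda()
model.eval()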
Furthermore, I wrapped the sentence-querying loop in an outer loop acting as a warm-up (I got the idea from here), and on its last iteration I set model.eval() and printed the scores, which turned out to be correct.
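In code, the warm-up is roughly this (the number of warm-up passes is my own arbitrary choice):

num_warmup = 5  # arbitrary; just run the forward pass a few extra times
for step in range(num_warmup):
    if step == num_warmup - 1:
        model.eval()  # switch to eval mode only for the final pass
    predictions, context = model(input_tensor, hidden)
    if step == num_warmup - 1:
        for i, prediction in enumerate(predictions):
            sorted_val = sorted(enumerate(np.array(prediction.data)), key=lambda x: x[1], reverse=True)
            print([(idx2word[x[0]], x[1]) for x in sorted_val[:5]])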