Inspired by Andrej Karpathy's blog i wanted to make my own version of a recurrent neural network that selects the next word instead of character. Because of the number of different words in a text is so many, i used word2vec to represent the words as vectors (where similar words are closer in the vector-space). The NN should now train to learn the new vector from the pattern of old ones.
-one important note is that where Karpathy used a classifier, i am trying a regression method (squared loss cost).
My problem is that my neural network predicts the output [0,0,0....,0] no matter how much training. so my guess is that there is a problem in my method of training or prediction (the average error drops a little during training, so some training must be done)
below is my entire code if anyone wants to run it (it uses the brown corpus so requires installation of nltk to work as is).
This is my "Hello World" project in Lasagne, so any pointers if i do something stupid is appreciated. Thanks in advance :)
from gensim.models import Word2Vec
import gensim
import sys
from datetime import timedelta
import matplotlib.pyplot as plt
from nltk.corpus import brown
import theano.tensor as T
import theano
import time
import numpy as np
from lasagne import layers
import lasagne
from lasagne.updates import nesterov_momentum
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
def modelExcept(input, model, size):
try:
out = model[input]
return out
except Exception:
out = np.zeros((size))
print 'exception ' + str(input)
return out
def plot_TSNE(model,nr_words=None):
tsne = TSNE(n_components=2)
if nr_words == None:
X_tsne = tsne.fit_transform(model[model.wv.vocab][:])
else:
X_tsne = tsne.fit_transform(model[model.wv.vocab][0:nr_words])
X_names = [key for key in model.wv.vocab]
plt.figure()
ax = plt.subplot(111)
for i in range(X_tsne.shape[0]):
plt.text(X_tsne[i, 0], X_tsne[i, 1], str(X_names[i]),
#color=plt.cm.Set1(y[i] / 10.),
fontdict={'weight': 'bold', 'size': 9})
plt.xticks([]), plt.yticks([])
plt.draw()
#plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
#plt.show()
def getBatch(words_as_vecs , wordSize,totalwords, windowSize, BATCHSIZE):
BatchIndexes = np.random.randint(0,totalwords-windowSize, size=BATCHSIZE)
input = np.empty((BATCHSIZE,windowSize,wordSize),dtype=np.float32)
target = np.empty((BATCHSIZE,wordSize),dtype=np.float32)
for i in range(BATCHSIZE):
k = BatchIndexes[i]
input[i,:,:] = words_as_vecs[k:k+windowSize,:]
target[i,:] = words_as_vecs[k+windowSize,:]
return input, target
wordSize = 30
windowSize = 5
BATCHSIZE = 128
LEARNING_RATE = .1
Nr_EPOCHS = 100
NR_Predictions = 15
model_raw = Word2Vec(brown.sents(),workers=4,window=10,iter=15,size=wordSize, min_count=10)
#plot_TSNE(model_raw,None)
model = model_raw.wv #trim model after training to save RAM
del model_raw
words_filtered = filter(lambda x: x in model.vocab, brown.words())#filter away words that are not in vocabulary
words_as_vecs = np.asarray([modelExcept(word, model,wordSize) for word in words_filtered],dtype = np.float32) #create all vector representations beforehand to save time!!
scaler = MinMaxScaler(feature_range=(0,1))
words_as_vecs = scaler.fit_transform(words_as_vecs)
print 'creating neural net...'
Num_units_per_layer = 512
GRAD_CLIP = 100
l_in = lasagne.layers.InputLayer(shape=(None,None,wordSize))
l_LSTM1 = lasagne.layers.LSTMLayer(l_in,Num_units_per_layer,grad_clipping=GRAD_CLIP,nonlinearity=lasagne.nonlinearities.rectify)
l_drop1 = lasagne.layers.DropoutLayer(l_LSTM1,p=0.5)
l_LSTM2 = lasagne.layers.LSTMLayer(l_drop1,Num_units_per_layer,grad_clipping=GRAD_CLIP,nonlinearity=lasagne.nonlinearities.rectify, only_return_final=True)
l_drop2 = lasagne.layers.DropoutLayer(l_LSTM2,p=0.5)
l_shp = lasagne.layers.ReshapeLayer(l_drop2,(-1,Num_units_per_layer))
l_out = lasagne.layers.DenseLayer(l_shp,num_units=wordSize,W=lasagne.init.Normal(), nonlinearity=lasagne.nonlinearities.rectify)
target_vals = T.imatrix('target values')
net_out = lasagne.layers.get_output(l_out)
net_out_predict = lasagne.layers.get_output(l_out,deterministic = True)
#use squared error because the problem is now a regession problem
cost = T.sum(lasagne.objectives.squared_error(net_out,target_vals))
all_params = lasagne.layers.get_all_params(l_out, trainable = True)
updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)
net_train = theano.function([l_in.input_var, target_vals], cost, updates=updates, allow_input_downcast=True)
compute_cost = theano.function([l_in.input_var, target_vals], cost, allow_input_downcast=True)
net_predict = theano.function([l_in.input_var],net_out_predict,allow_input_downcast=True)
print 'creating testphrase...'
testphrase_vectors = np.empty((1,5,wordSize),dtype=np.float32)
testphrase_vectors[0,:,:] = words_as_vecs[1:6,:]
testphrase_words = words_filtered[0:6]
#testphrase_words = brown.words()[0:6]
print 'training...'
avg_cost = 0
totalwords = len(words_filtered)
#totalwords = len(brown.words())
print_freq = totalwords/BATCHSIZE #print example every epoch
nrItterations = Nr_EPOCHS*totalwords/BATCHSIZE
for i in range(nrItterations):
inTrain, target = getBatch(words_as_vecs, wordSize, totalwords, windowSize, BATCHSIZE)
avg_cost += net_train(inTrain,target)
#generate text sample
if (i%print_freq == 0) and (i != 0):
print 'prediction of train'
print 'average cost is {0}' .format(avg_cost/(BATCHSIZE*print_freq))
avg_cost = 0
generated_example = ' '.join(testphrase_words)
testphrase_vectors_copy = testphrase_vectors
for k in range(NR_Predictions):
prediction = np.asarray(net_predict(testphrase_vectors_copy))
prediction_unscaled = scaler.inverse_transform(prediction.reshape(1,-1)).reshape(-1)
current_word = model.most_similar(positive=[prediction_unscaled], topn=1)
generated_example = ' '.join((generated_example, current_word[0][0]))
#insert new word in testphrase (and delete first)
testphrase_vectors_copy[0,0:-1,:] = testphrase_vectors_copy[0,1:,:]
testphrase_vectors_copy[0,-1,:] = model[current_word[0][0]]
#print testphrase_vectors_copy
print 'example nr. {}' .format(i/print_freq + 1)
print generated_example
print '\n \n'
I finally found the error.
The problem was this line:
target_vals = T.imatrix('target values')
which should be:
target_vals = T.fmatrix('target values')
since i'm aiming after floats and not integers.