I am writing an MLP (multi-layer perceptron) from scratch in Python, using only NumPy for most of the functionality.
import numpy as np
import matplotlib.pyplot as plt

# my loss and activation functions
def relu(x):
    return np.maximum(0, x)

def relu_prime(x):
    return np.where(x > 0, 1, 0)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_prime(x):
    s = sigmoid(x)
    return s * (1 - s)

def softmax(x):
    # subtract the row max for numerical stability; the result is unchanged
    exp = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp / np.sum(exp, axis=1, keepdims=True)

def softmax_prime(x):
    # element-wise derivative (the diagonal of the softmax Jacobian)
    return softmax(x) * (1 - softmax(x))

def cross_entropy(y, y_hat):
    return -np.sum(y * np.log(y_hat + 1e-8))

def cross_entropy_prime(y, y_hat):
    return y - y_hat

def mse(y, y_hat):
    return np.mean((y - y_hat) ** 2)

def mse_prime(y, y_hat):
    return 2 * (y_hat - y) / y.size
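As a sanity check, a quick finite-difference comparison can confirm that each analytic derivative matches its function. This is a throwaway sketch, not part of the model; check_derivative is just a helper name for this post:

def check_derivative(f, f_prime, x, eps=1e-6):
    # compare a central-difference estimate against the analytic derivative
    numeric = (f(x + eps) - f(x - eps)) / (2 * eps)
    return np.max(np.abs(numeric - f_prime(x)))

print(check_derivative(sigmoid, sigmoid_prime, np.random.randn(5)))  # should print a value close to zero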
The model uses backpropagation to adjust the weights during training. In each layer I store the inputs, the outputs before and after activation, and the gradients, so they are easy to access during the backward pass.
class Layer:
    def __init__(self, n_input, n_neurons, activation=relu, activation_prime=relu_prime):
        self.weights = np.random.randn(n_input, n_neurons)
        self.biases = np.random.randn(n_neurons)
        self.activation = activation
        self.activation_prime = activation_prime

    def forward(self, inputs):
        self.inputs = inputs
        self.z = np.dot(inputs, self.weights) + self.biases  # pre-activation output
        self.output = self.activation(self.z)                # post-activation output
        return self.output

    def backward(self, dvalues):
        self.dz = dvalues * self.activation_prime(self.z)
        self.dinputs = self.dz.dot(self.weights.T)   # gradient w.r.t. this layer's inputs
        self.dweights = self.inputs.T.dot(self.dz)   # gradient w.r.t. weights
        self.dbiases = np.sum(self.dz, axis=0)       # gradient w.r.t. biases
        return self.dinputs

    def update(self, learning_rate):
        self.weights -= self.dweights * learning_rate
        self.biases -= self.dbiases * learning_rate
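A quick shape check on a single layer (again just a throwaway sketch) confirms that forward and backward line up dimensionally:

layer = Layer(4, 5)
out = layer.forward(np.random.randn(10, 4))
dinputs = layer.backward(np.ones_like(out))
print(out.shape, dinputs.shape)  # (10, 5) (10, 4)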
class Model:
    def __init__(self):
        self.layers = []

    def add(self, layer):
        self.layers.append(layer)

    def forward(self, inputs):
        for layer in self.layers:
            inputs = layer.forward(inputs)
        return inputs

    def backward(self, dvalues):
        for layer in reversed(self.layers):
            dvalues = layer.backward(dvalues)

    def update(self, learning_rate):
        for layer in self.layers:
            layer.update(learning_rate)

    def predict(self, inputs):
        return self.forward(inputs)

    def evaluate(self, X, Y):
        # accuracy: fraction of samples where the predicted class matches the label
        predictions = self.predict(X)
        return np.mean(np.argmax(predictions, axis=1) == np.argmax(Y, axis=1))

    def compile(self, loss, loss_prime, learning_rate=0.01):
        self.loss = loss
        self.loss_prime = loss_prime
        self.learning_rate = learning_rate

    def fit(self, X, Y, epochs=100):
        # full-batch gradient descent
        loss = []
        for i in range(epochs):
            outputs = self.forward(X)
            loss.append(self.loss(Y, outputs))
            dvalues = self.loss_prime(Y, outputs)
            self.backward(dvalues)
            self.update(self.learning_rate)
            print(f"Epoch {i}: {loss[-1]}")
        return loss
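To localize backprop bugs, a gradient check on a compiled model compares one analytic weight gradient against a finite difference of the loss. A minimal sketch (it assumes model.compile(...) has already been called; check_model_gradient is just a name for this post):

def check_model_gradient(model, X, Y, eps=1e-6):
    # analytic gradient of the loss w.r.t. one weight, via backprop
    outputs = model.forward(X)
    model.backward(model.loss_prime(Y, outputs))
    layer = model.layers[0]
    analytic = layer.dweights[0, 0]
    # numeric gradient of the same weight, via central differences
    layer.weights[0, 0] += eps
    loss_plus = model.loss(Y, model.forward(X))
    layer.weights[0, 0] -= 2 * eps
    loss_minus = model.loss(Y, model.forward(X))
    layer.weights[0, 0] += eps  # restore the original weight
    numeric = (loss_plus - loss_minus) / (2 * eps)
    # the two should roughly agree; a systematic sign flip points at loss_prime
    return analytic, numeric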
I use the iris dataset from sklearn, min-max normalizing the input features and one-hot encoding the target labels.
from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data
# min-max normalize each feature to [0, 1]
X = (X - np.min(X, axis=0)) / (np.max(X, axis=0) - np.min(X, axis=0))
# one-hot encode the three classes
Y = iris.target
y = np.zeros((X.shape[0], 3))
y[np.arange(X.shape[0]), Y] = 1
Y = y
So far the model performs okay, though not particularly well, when I use a sigmoid activation function at the output layer and mean squared error (MSE) as the loss function.
model = Model()
model.add(Layer(4, 5))
model.add(Layer(5, 6))
model.add(Layer(6, 3, activation=sigmoid, activation_prime=sigmoid_prime))
model.compile(loss=mse, loss_prime=mse_prime, learning_rate=0.004)
loss = model.fit(X, Y, epochs=20000)
plt.plot(loss)
# Last epoch: "Epoch 19999: 0.12373022229717626"
evaluation = model.evaluate(X, Y)
print(evaluation)  # 0.6666666666666666
Link to the loss graph https://i.sstatic.net/to0Sy.png
However, when I try to use a softmax activation function at the output layer and cross-entropy as the loss function, I'm not able to achieve good results.
model2 = Model()
model2.add(Layer(4, 5))
model2.add(Layer(5, 6))
model2.add(Layer(6, 3, activation=softmax, activation_prime=softmax_prime))
model2.compile(cross_entropy, cross_entropy_prime, learning_rate=0.00001)
loss = model2.fit(X, Y, epochs=300)
plt.plot(loss)
# Last epoch "Epoch 299: 1112.783115819416"
print(model2.evaluate(X, Y)) # 0.08
Link to the loss graph https://i.sstatic.net/OT7xG.png
I'm wondering if anyone can help me figure out why this is happening and how I can fix it. Thank you.
I have read many articles, mainly on Medium and Stack Exchange, drawn the network on paper, and derived backpropagation countless times. I have also looked at how others use softmax and cross-entropy. Thanks to those guides, my MLP works fine with sigmoid activation and mean squared error loss. However, I am now stuck and want it to run well with softmax activation and cross-entropy loss.
Thank you, hobbs, for discovering this. Apparently the issue was really simple: a wrong sign in the cross_entropy_prime function. The function should look like this:
def cross_entropy_prime(y, y_hat):
    return y_hat - y
The wrong sign was causing gradient descent to move the weights and biases away from the minimum, so the loss eventually exploded.
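As an aside, the reason y_hat - y is the right gradient here: when cross-entropy is taken directly on softmax outputs, the full softmax Jacobian collapses so that the derivative with respect to the pre-activation z is exactly y_hat - y. A common shortcut (a sketch of the standard trick, not what my code above does; identity_prime and model3 are names I'm introducing here) is to give the output layer an identity derivative and let the corrected cross_entropy_prime feed y_hat - y straight through:

def identity_prime(x):
    # with softmax + cross-entropy combined, d(loss)/dz = y_hat - y,
    # so the output layer's activation derivative can be treated as identity
    return np.ones_like(x)

model3 = Model()
model3.add(Layer(4, 5))
model3.add(Layer(5, 6))
model3.add(Layer(6, 3, activation=softmax, activation_prime=identity_prime))
model3.compile(cross_entropy, cross_entropy_prime, learning_rate=0.01)
# model3.fit(X, Y, epochs=...) now propagates exactly y_hat - y as dz at the output layer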