numpy machine-learning backpropagation softmax cross-entropy

Why does my MLP model's loss explode when using softmax and cross entropy in Python?


I am writing an MLP (multilayer perceptron) from scratch in Python, using NumPy for most of the functions.

import numpy as np
import matplotlib.pyplot as plt

# my loss and activation functions
def relu(x):
    return np.maximum(0, x)

def relu_prime(x):
    return np.where(x > 0, 1, 0)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_prime(x):
    return sigmoid(x) * (1 - sigmoid(x))

def softmax(x):
    exp = np.exp(x)
    return exp / np.sum(exp, axis=1, keepdims=True)

def softmax_prime(x):
    return softmax(x) * (1 - softmax(x))

def cross_entropy(y, y_hat):
    return -np.sum(y * np.log(y_hat + 1e-8))

def cross_entropy_prime(y, y_hat):
    return y - y_hat

def mse(y, y_hat):
    return np.mean((y - y_hat) ** 2)

def mse_prime(y, y_hat):
    return 2 * (y_hat - y) / y.size
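
A side note on the softmax above: np.exp overflows for large logits, which can by itself produce inf/nan losses. A common guard, shown here as a sketch rather than as part of the original code, is to subtract the row-wise maximum before exponentiating, which leaves the softmax output unchanged:

def softmax_stable(x):
    # softmax is invariant to subtracting a per-row constant, so shifting
    # by the row max keeps np.exp from ever seeing large positive values
    shifted = x - np.max(x, axis=1, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=1, keepdims=True)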

My MLP uses backpropagation to adjust the weights during training. In each layer, I store the input vectors, the output vectors before and after activation, and the gradient vectors for easy access.

class Layer:
    def __init__(self, n_input, n_neurons, activation=relu, activation_prime=relu_prime):
        self.weights = np.random.randn(n_input, n_neurons)
        self.biases = np.random.randn(n_neurons)
        self.activation = activation
        self.activation_prime = activation_prime

    def forward(self, inputs):
        self.inputs = inputs
        self.z = np.dot(inputs, self.weights) + self.biases
        self.output = self.activation(self.z)
        return self.output

    def backward(self, dvalues):
        # chain rule: gradient w.r.t. the pre-activation z
        self.dz = dvalues * self.activation_prime(self.z)
        # gradients w.r.t. the inputs (propagated to the previous layer),
        # the weights, and the biases
        self.dinputs = self.dz.dot(self.weights.T)
        self.dweights = self.inputs.T.dot(self.dz)
        self.dbiases = np.sum(self.dz, axis=0)
        return self.dinputs

    def update(self, learning_rate):
        self.weights -= self.dweights * learning_rate
        self.biases -= self.dbiases * learning_rate
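
For reference, a quick shape check of a single layer (an illustrative snippet, not from the original post):

layer = Layer(4, 5)
out = layer.forward(np.random.randn(8, 4))
print(out.shape)  # (8, 5): one 5-dimensional row per input sample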

class Model:
    def __init__(self):
        self.layers = []

    def add(self, layer):
        self.layers.append(layer)

    def forward(self, inputs):
        for layer in self.layers:
            inputs = layer.forward(inputs)
        return inputs

    def backward(self, dvalues):
        for layer in reversed(self.layers):
            dvalues = layer.backward(dvalues)

    def update(self, learning_rate):
        for layer in self.layers:
            layer.update(learning_rate)

    def predict(self, inputs):
        return self.forward(inputs)

    def evaluate(self, X, Y):
        predictions = self.predict(X)
        return np.mean(np.argmax(predictions, axis=1) == np.argmax(Y, axis=1))

    def compile(self, loss, loss_prime, learning_rate=0.01):
        self.loss = loss
        self.loss_prime = loss_prime
        self.learning_rate = learning_rate

    def fit(self, X, Y, epochs=100):
        loss = []
        for i in range(epochs):
            outputs = self.forward(X)
            loss.append(self.loss(Y, outputs))
            dvalues = self.loss_prime(Y, outputs)
            self.backward(dvalues)
            self.update(self.learning_rate)
            print(f"Epoch {i}: {loss[-1]}")
        return loss

I use the iris dataset from sklearn in my MLP, min-max normalizing the input features and one-hot encoding the target labels.

from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data
X = (X - np.min(X, axis=0)) / (np.max(X, axis=0) - np.min(X, axis=0))
Y = iris.target
y = np.zeros((X.shape[0], 3))
y[np.arange(X.shape[0]), Y] = 1
Y = y
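
As an aside, the same one-hot encoding can be written more compactly with np.eye (equivalent to the three lines above):

# equivalent one-hot encoding in a single line
Y = np.eye(3)[iris.target]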

So far, the model trains fine, though not particularly well, when I use a sigmoid activation function at the output layer and mean squared error (MSE) as the loss function.

model = Model()
model.add(Layer(4, 5))
model.add(Layer(5, 6))
model.add(Layer(6, 3, activation=sigmoid, activation_prime=sigmoid_prime))

model.compile(loss=mse, loss_prime=mse_prime, learning_rate=0.004)
loss = model.fit(X, Y, epochs=20000)
plt.plot(loss)
# Last epoch "Epoch 19999: 0.12373022229717626"

evaluation = model.evaluate(X, Y)
print(evaluation) # 0.6666666666666666

Link to the loss graph https://i.sstatic.net/to0Sy.png

However, when I use a softmax activation function at the output layer and cross-entropy as the loss function, I am not able to achieve good results.

model2 = Model()
model2.add(Layer(4, 5))
model2.add(Layer(5, 6))
model2.add(Layer(6, 3, activation=softmax, activation_prime=softmax_prime))

model2.compile(cross_entropy, cross_entropy_prime, learning_rate=0.00001)
loss = model2.fit(X, Y, epochs=300)
plt.plot(loss)
# Last epoch "Epoch 299: 1112.783115819416"

print(model2.evaluate(X, Y)) # 0.08

Link to the loss graph https://i.sstatic.net/OT7xG.png

I'm wondering if anyone can help me figure out why this is happening and how I can fix it. Thank you.

I have read many articles, mainly on Medium and Stack Exchange, drawn the network on paper, and derived backpropagation countless times. I have also looked at how others use softmax and cross-entropy. Thanks to those guides, my MLP works fine with sigmoid activation and mean squared error loss, but I am now stuck getting it to run well with softmax activation and cross-entropy loss.
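
A cheap way to validate a hand-derived gradient is a finite-difference check. Here is a minimal sketch for the softmax + cross-entropy head, using the functions defined above (the test values are arbitrary):

# compare the well-known analytic gradient of cross-entropy w.r.t. the
# logits, softmax(z) - y, against a central finite difference
z = np.array([[0.3, -0.1, 0.5]])
y = np.array([[0.0, 1.0, 0.0]])
analytic = softmax(z) - y
eps = 1e-6
numeric = np.zeros_like(z)
for j in range(z.shape[1]):
    bump = np.zeros_like(z)
    bump[0, j] = eps
    numeric[0, j] = (cross_entropy(y, softmax(z + bump))
                     - cross_entropy(y, softmax(z - bump))) / (2 * eps)
print(np.allclose(analytic, numeric, atol=1e-5))  # True if the math checks out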


Solution

  • Thank you, hobbs, for discovering this. Apparently the issue was really simple: the wrong sign in the cross_entropy_prime function. The function should look like this:

    def cross_entropy_prime(y, y_hat):
        return y_hat - y
    

    The wrong sign was causing gradient descent to move the weights and biases away from the minimum, which is why the loss eventually exploded.
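
    A follow-up worth noting: even with the sign fixed, Layer.backward still multiplies the loss gradient by softmax_prime(z), and the elementwise softmax_prime above is only the diagonal of the true softmax Jacobian. The usual shortcut is to treat y_hat - y as the gradient with respect to the logits directly and make the output activation's derivative a pass-through. A minimal sketch under that assumption (softmax_passthrough_prime and model3 are illustrative names, not from the original post):

    def softmax_passthrough_prime(x):
        # cross_entropy_prime already returns y_hat - y, the exact gradient
        # w.r.t. the pre-activation z, so the softmax Jacobian must not be
        # applied a second time in Layer.backward
        return np.ones_like(x)

    model3 = Model()
    model3.add(Layer(4, 5))
    model3.add(Layer(5, 6))
    model3.add(Layer(6, 3, activation=softmax, activation_prime=softmax_passthrough_prime))
    model3.compile(cross_entropy, cross_entropy_prime, learning_rate=0.01)

    The learning rate here is illustrative; since cross_entropy sums rather than averages over the batch, the gradients scale with the number of samples, so a smaller rate (or a mean-based loss) may be needed.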