python, machine-learning, neural-network, activation-function, relu

ReLU inconsistency/randomized behaviour


I wrote a simple neural network (it should add two numbers) and tried different activation functions. This is my code:

import numpy as np

class Layer:
    def __init__(self):
        self.inputs = None

    def forward(self, inputs):
        pass

    def backward(self, error_gradient, lr):
        pass

class Dense(Layer):
    def __init__(self, n_inputs, n_neurons):
        super().__init__()
        self.weights = np.random.randn(n_neurons, n_inputs)
        self.biases = np.random.randn(n_neurons, 1)

    def forward(self, inputs):
        self.inputs = inputs
        return np.dot(self.weights, self.inputs) + self.biases

    def backward(self, error_gradient, lr):
        weight_deriv = np.dot(error_gradient, self.inputs.T)
        # gradient w.r.t. the inputs, computed before the weights are updated
        input_gradient = np.dot(self.weights.T, error_gradient)
        self.weights -= lr * weight_deriv
        self.biases -= lr * error_gradient  # gradient step, not lr * self.biases
        return input_gradient

class Activation(Layer):
    def __init__(self, activation, activation_prime):
        self.activation = activation
        self.activation_prime = activation_prime
        super().__init__()

    def forward(self, inputs):
        self.inputs = inputs
        return self.activation(self.inputs)

    def backward(self, error_gradient, lr):
        return np.multiply(error_gradient, self.activation_prime(self.inputs))

class Tanh(Activation):
    def __init__(self):
        super().__init__(lambda x: np.tanh(x), lambda y: 1.0 - (np.tanh(y) ** 2))

class ReLU(Activation):
    def __init__(self):
        super().__init__(lambda x: np.maximum(0, x), lambda y: np.where(y > 0, 1, 0))

class Sigmoid(Activation):
    def __init__(self):
        super().__init__(lambda x: 1.0 / (1 + np.exp(-x)), lambda y: (1.0 / (1 + np.exp(-y))) * (1 - (1.0 / (1 + np.exp(-y)))))

def mse(y_pred, y_true):
    return np.mean(np.power(y_true - y_pred, 2))

def mse_prime(y_pred, y_true):
    return 2 * (y_pred - y_true)

def run(nn, inputs):
    out = inputs
    for layer in nn:
        out = layer.forward(out)
    return out

and this is the main block:

if __name__ == '__main__':
    X = np.reshape([[0.1, 0.2], [0.5, 0.3], [0.2, 0.4], [0.3, 0.7], [0.5, 0.5], [0.4, 0.3]], (6, 2, 1))
    Y = np.reshape([[0.3], [0.8], [0.6], [1.0], [1.0], [0.7]], (6, 1, 1))

    epochs, learning_rate = 5000, 0.01

    network = [
        Dense(2, 4),
        ReLU(),
        Dense(4, 4),
        ReLU(),
        Dense(4, 1),
        ReLU()
    ]

    for epoch in range(epochs):
        epoch_error = 0
        for x, y in zip(X, Y):
            output = run(network, x)
            epoch_error += mse(output, y)
            output_gradient = mse_prime(output, y)
            for layer in reversed(network):
                output_gradient = layer.backward(output_gradient, learning_rate)
        epoch_error /= len(X)
        print("%d/%d, error = %f" % (epoch, epochs, epoch_error))

    test = np.reshape([0.1, 0.5], (2, 1))

    pred = run(network, test)

    print("Prediction = %f" % pred[0][0])

I have two questions:

  1. When using activations other than ReLU with learning_rate = 0.1, it takes over 100,000 epochs to get the error close to zero, and it still never reaches exactly 0, but it is consistent and the error always goes down. So the first question: why does it take so many epochs to solve such a simple task as adding two numbers when using Sigmoid or Tanh?

  2. When using ReLU the error can go to 0 really fast, in maybe around 5,000 epochs, but the problem is that this isn't consistent and sometimes the error never goes down. Why is that happening (I thought the problem was in the weight initialization, but I'm not sure), and why, when it does work, does it drive the error to 0 fast compared to the other activation functions?
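
As a quick check on the weight-initialization suspicion in question 2, NumPy's seed can be fixed so the np.random.randn draws in Dense.__init__ are reproducible, and the same architecture can then be trained from a few different seeds. The following is a minimal sketch using the classes and data defined above; the build_network helper and the seed values are illustrative, not part of the original code:

import numpy as np

# Rebuild the same architecture under a fixed seed so the random
# weight/bias draws in Dense.__init__ are identical from run to run.
def build_network(seed):
    np.random.seed(seed)
    return [Dense(2, 4), ReLU(), Dense(4, 4), ReLU(), Dense(4, 1), ReLU()]

# Train from several seeds and compare the final errors; if some seeds
# reach near-zero error while others never improve, the inconsistency is
# coming from the random initialization rather than from the data or loop.
for seed in (0, 1, 2, 3, 4):
    network = build_network(seed)
    for _ in range(5000):
        for x, y in zip(X, Y):
            grad = mse_prime(run(network, x), y)
            for layer in reversed(network):
                grad = layer.backward(grad, 0.01)
    final_error = np.mean([mse(run(network, x), y) for x, y in zip(X, Y)])
    print("seed %d -> error %f" % (seed, final_error))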


Solution

    1. The loss doesn't get to exactly zero because of the vanishing gradient problem: the sigmoid's derivative is at most 0.25 and tanh's is at most 1, so the gradient shrinks as it is propagated back through several saturating layers, which slows learning down.
    2. And sometimes the error never goes down because the weights have reached a local minimum, which the gradient descent algorithm often runs into. Try using SGD (stochastic gradient descent) with momentum to avoid getting stuck in a local minimum; a sketch of the momentum update is shown below.
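
The momentum suggestion in point 2 can be sketched directly on top of the Dense layer above. This DenseWithMomentum subclass and the 0.9 default are illustrative choices, not code from the original post:

import numpy as np

class DenseWithMomentum(Dense):
    def __init__(self, n_inputs, n_neurons, momentum=0.9):
        super().__init__(n_inputs, n_neurons)
        self.momentum = momentum
        # running "velocity" for each parameter, carried across updates
        self.w_velocity = np.zeros_like(self.weights)
        self.b_velocity = np.zeros_like(self.biases)

    def backward(self, error_gradient, lr):
        weight_deriv = np.dot(error_gradient, self.inputs.T)
        # gradient w.r.t. the inputs, taken before the weights change
        input_gradient = np.dot(self.weights.T, error_gradient)

        # classical momentum: blend the previous velocity with the new step
        self.w_velocity = self.momentum * self.w_velocity - lr * weight_deriv
        self.b_velocity = self.momentum * self.b_velocity - lr * error_gradient
        self.weights += self.w_velocity
        self.biases += self.b_velocity
        return input_gradient

Swapping DenseWithMomentum in for Dense in the network list leaves the rest of the training loop unchanged; the accumulated velocity keeps the updates moving through flat stretches where a plain gradient step would stall.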