I wrote a simple neural network that should learn to add two numbers and tried different activation functions. This is my code:
import numpy as np

class Layer:
    def __init__(self):
        self.inputs = None

    def forward(self, inputs):
        pass

    def backward(self, error_gradient, lr):
        pass

class Dense(Layer):
    def __init__(self, n_inputs, n_neurons):
        super().__init__()
        self.weights = np.random.randn(n_neurons, n_inputs)
        self.biases = np.random.randn(n_neurons, 1)

    def forward(self, inputs):
        self.inputs = inputs
        return np.dot(self.weights, self.inputs) + self.biases

    def backward(self, error_gradient, lr):
        # gradient w.r.t. the inputs, using the weights from the forward pass
        input_gradient = np.dot(self.weights.T, error_gradient)
        # gradient-descent step on the parameters
        self.weights -= lr * np.dot(error_gradient, self.inputs.T)
        self.biases -= lr * error_gradient
        return input_gradient

class Activation(Layer):
    def __init__(self, activation, activation_prime):
        super().__init__()
        self.activation = activation
        self.activation_prime = activation_prime

    def forward(self, inputs):
        self.inputs = inputs
        return self.activation(self.inputs)

    def backward(self, error_gradient, lr):
        # chain rule: scale the incoming gradient by the activation derivative
        return np.multiply(error_gradient, self.activation_prime(self.inputs))

class Tanh(Activation):
    def __init__(self):
        super().__init__(lambda x: np.tanh(x),
                         lambda x: 1.0 - np.tanh(x) ** 2)

class ReLU(Activation):
    def __init__(self):
        super().__init__(lambda x: np.maximum(0, x),
                         lambda x: np.where(x > 0, 1.0, 0.0))

class Sigmoid(Activation):
    def __init__(self):
        sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))
        super().__init__(sigmoid,
                         lambda x: sigmoid(x) * (1.0 - sigmoid(x)))

def mse(y_pred, y_true):
    # mean squared error as a scalar
    return np.mean(np.power(y_true - y_pred, 2))

def mse_prime(y_pred, y_true):
    return 2 * (y_pred - y_true)

def run(nn, inputs):
    out = inputs
    for layer in nn:
        out = layer.forward(out)
    return out
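For reference, this is a quick standalone shape check I do on a single Dense layer (the dummy input and gradient values here are arbitrary, it is not part of the training code):

layer = Dense(2, 4)
x = np.reshape([0.1, 0.5], (2, 1))               # column-vector input, shape (2, 1)
out = layer.forward(x)                           # shape (4, 1)
grad_in = layer.backward(np.ones((4, 1)), 0.01)  # gradient w.r.t. the input, shape (2, 1)
print(out.shape, grad_in.shape)                  # (4, 1) (2, 1)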
And this is the main script:
if __name__ == '__main__':
    # training data: pairs of numbers (as column vectors) and their sums
    X = np.reshape([[0.1, 0.2], [0.5, 0.3], [0.2, 0.4], [0.3, 0.7], [0.5, 0.5], [0.4, 0.3]], (6, 2, 1))
    Y = np.reshape([[0.3], [0.8], [0.6], [1.0], [1.0], [0.7]], (6, 1, 1))

    epochs, learning_rate = 5000, 0.01

    network = [
        Dense(2, 4),
        ReLU(),
        Dense(4, 4),
        ReLU(),
        Dense(4, 1),
        ReLU()
    ]

    for epoch in range(epochs):
        epoch_error = 0
        for x, y in zip(X, Y):
            # forward pass
            output = run(network, x)
            epoch_error += mse(output, y)
            # backward pass: propagate the loss gradient through the layers in reverse
            output_gradient = mse_prime(output, y)
            for layer in reversed(network):
                output_gradient = layer.backward(output_gradient, learning_rate)
        epoch_error /= len(X)
        print("%d/%d, error = %f" % (epoch, epochs, epoch_error))

    test = np.reshape([0.1, 0.5], (2, 1))
    pred = run(network, test)
    print("Prediction = %f" % pred[0][0])
I have two questions:
When using activations other than ReLU (Tanh or Sigmoid) with learning_rate = 0.1, it takes over 100,000 epochs to get the error close to zero, and it still never reaches 0, although it is consistent and the error always goes down. So my first question is: why does it take so many epochs to solve such a simple task as adding two numbers when using Sigmoid or Tanh?
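To illustrate what I mean, this is a standalone check of the derivative values that the activations above produce at a few sample pre-activations (the sample values are arbitrary):

z = np.array([-3.0, -1.0, 0.0, 1.0, 3.0])    # arbitrary sample pre-activations
sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))
print(1.0 - np.tanh(z) ** 2)                 # tanh derivative:    roughly [0.01, 0.42, 1.0, 0.42, 0.01]
print(sigmoid(z) * (1.0 - sigmoid(z)))       # sigmoid derivative: roughly [0.05, 0.20, 0.25, 0.20, 0.05]
print(np.where(z > 0, 1.0, 0.0))             # ReLU derivative:    [0, 0, 0, 1, 1]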
When using ReLU, the error can go to 0 really fast, in around 5,000 epochs, but it isn't consistent: sometimes the error never goes down at all. Why does that happen (I thought the problem was in the weight initialization, but I'm not sure), and why, when it does work, does the error go to 0 so much faster than with the other activation functions?
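In case it's relevant to the second question, this is the kind of change I mean by "weight initialization": a He-style scaled init (something I read about, not something I've confirmed fixes the inconsistency), as an alternative to the plain np.random.randn in Dense:

class DenseHe(Dense):
    # hypothetical variant of Dense: weights scaled by sqrt(2 / n_inputs), biases start at zero
    def __init__(self, n_inputs, n_neurons):
        super().__init__(n_inputs, n_neurons)
        self.weights = np.random.randn(n_neurons, n_inputs) * np.sqrt(2.0 / n_inputs)
        self.biases = np.zeros((n_neurons, 1))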