I tried to build a neural network with two neurons, as described in the book Why Machines Learn on page 330. This is my code, and I don't know why it doesn't work. I tried something like this before with just one neuron, but now with two I can't explain why it won't work. All formulas I used are described in the book, so I think they are correct. I can list them as well if needed.
def training(x_data, labels, w11=0, w12=0, w2=0, b1=0, b2=0, alpha=0.1):
    w11_list = []
    w12_list = []
    b1_list = []
    w2_list = []
    b2_list = []
    L_list = []
    for i in range(len(x_data)):
        z1 = neuron1(x_data[i][0], x_data[i][1], w11=w11, w12=w12, b1=b1)
        z2 = neuron2(z1, w2=w2, b2=b2)
        yhat = a(z2)  # calculate the output of the neural net
        e = labels[i] - yhat  # calculate the error (labels[i] = true value)
        L_list.append(e**2)
        w_n2 = -2*e*a(z1)*(yhat*(1-yhat))  # formula to update the weight
        w2_list.append(w_n2)  # added to list to take the mean
        b_n2 = -2*e*(yhat*(1-yhat))  # formula to update the bias
        b2_list.append(b_n2)  # added to list to take the mean
        w1_n1 = -2*e*x_data[i][0]*w2*(yhat*(1-yhat))*(a(z1)*(1-a(z1)))  # same as above, just for the other weights and biases, so the formula changes
        w11_list.append(w1_n1)
        w2_n1 = -2*e*x_data[i][1]*w2*(yhat*(1-yhat))*a(z1)*(1-a(z1))
        w12_list.append(w2_n1)
        b_n1 = -2*e*w2*(yhat*(1-yhat))*(a(z1)*(1-a(z1)))
        b1_list.append(b_n1)
    w11_sum = sum(w11_list)/len(w11_list)  # average gradient for w11
    delta_w11 = -alpha*w11_sum  # multiplied by the learning rate alpha
    w11 = w11 + delta_w11  # added to the old w11 value
    w12_sum = sum(w12_list)/len(w12_list)  # same as above, but for the other weights and biases
    delta_w12 = -alpha*w12_sum
    w12 = w12 + delta_w12
    b1_sum = sum(b1_list)/len(b1_list)
    delta_b1 = -alpha*b1_sum
    b1 = b1 + delta_b1
    w2_sum = sum(w2_list)/len(w2_list)
    delta_w2 = -alpha*w2_sum
    w2 = w2 + delta_w2
    b2_sum = sum(b2_list)/len(b2_list)
    delta_b2 = -alpha*b2_sum
    b2 = b2 + delta_b2
    L_mean = sum(L_list)/len(L_list)  # calculate the mean loss over the epoch
    print(L_mean)
    return w11, w12, w2, b1, b2
Here are the functions I used in training().
import math

def neuron1(x1: input, x2: input, w11, w12, b1):
    z = w11*x1 + w12*x2 + b1
    y = a(z)
    return y

def neuron2(y, w2, b2):
    z = w2*y + b2
    y = a(z)
    return y

def a(z):
    return 1/(1 + math.e**-z)
I printed out the loss (L_mean in the code), but it didn't change. When I created the 1-neuron network, the loss went down and the model started to learn. That hasn't happened here; the loss stays the same. I also changed the data I am training with and used the data I created for the 1-neuron network, but my 2-neuron network still doesn't learn at all.
I tried changing a few things with ChatGPT, but that wasn't much help.
There are a couple of issues with your code.
First off, don't initialize all weights to the same constant, and especially not to zero: every gradient term for w11, w12 and b1 contains the factor w2, so with w2 = 0 the first neuron gets no gradient signal at the start and learning stalls.
Second, you're mixing up the weighted sum and the activation of each neuron: neuron1() and neuron2() already return a(z), yet your gradient formulas apply a() to those values again (e.g. a(z1)*(1-a(z1)), where z1 is already an activation). As a consequence, your backpropagation derivatives are evaluated at the wrong point.
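To see this concretely, here is a tiny self-contained sketch (the numeric values are made up, only the structure matters): because neuron1() returns a(z), the z1 in your loop is already an activation, so a(z1)*(1-a(z1)) computes the sigmoid derivative of the wrong quantity.

import math

def a(z):
    return 1 / (1 + math.e**-z)

# made-up example values, just to show the double application of a()
w11, w12, b1 = 0.5, -0.3, 0.1
x1, x2 = 1, 0

z1 = w11*x1 + w12*x2 + b1   # weighted sum (pre-activation)
a1 = a(z1)                  # what neuron1() actually returns

print(a1 * (1 - a1))        # sigmoid derivative at z1 -- what backprop needs
print(a(a1) * (1 - a(a1)))  # what your formula computes: a() applied twice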
The partial derivatives with respect to the weights in both layers go as follows:
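Writing the loss for one sample as L = (y - ŷ)², with ŷ = a(z2), z2 = w2·a1 + b2, a1 = a(z1) and z1 = w11·x1 + w12·x2 + b1, the chain rule gives (these are exactly the gradient terms used in the corrected code below):

$$\frac{\partial L}{\partial w_2} = -2\,(y-\hat{y})\,\hat{y}(1-\hat{y})\,a_1$$

$$\frac{\partial L}{\partial w_{11}} = -2\,(y-\hat{y})\,\hat{y}(1-\hat{y})\,w_2\,a_1(1-a_1)\,x_1$$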
The remaining partial derivatives are left as an exercise for you to derive/recheck.
I've corrected your code accordingly and observe a loss drop when training a simple AND function:
import math
import random


def training(x_data, labels, w11, w12, w2, b1, b2, alpha=0.1):
    w11_list = []
    w12_list = []
    b1_list = []
    w2_list = []
    b2_list = []
    L_list = []
    for i in range(len(x_data)):
        x1, x2 = x_data[i][0], x_data[i][1]

        # Forward pass
        z1 = neuron1(x1, x2, w11=w11, w12=w12, b1=b1)
        a1 = a(z1)
        z2 = neuron2(a1, w2=w2, b2=b2)
        yhat = a2 = a(z2)

        # Compute error (loss)
        e = labels[i] - yhat
        L_list.append(e**2)

        # Backward pass
        w_n2 = -2 * e * yhat * (1 - yhat) * a1
        w2_list.append(w_n2)
        b_n2 = -2 * e * yhat * (1 - yhat)
        b2_list.append(b_n2)
        w1_n1 = -2 * e * yhat * (1 - yhat) * w2 * a1 * (1 - a1) * x1
        w11_list.append(w1_n1)
        w2_n1 = -2 * e * yhat * (1 - yhat) * w2 * a1 * (1 - a1) * x2
        w12_list.append(w2_n1)
        b_n1 = -2 * e * yhat * (1 - yhat) * w2 * a1 * (1 - a1)
        b1_list.append(b_n1)

    # Update weights & biases with the averaged gradients (once per epoch)
    w11_sum = sum(w11_list) / len(w11_list)
    delta_w11 = -alpha * w11_sum
    w11 = w11 + delta_w11
    w12_sum = sum(w12_list) / len(w12_list)
    delta_w12 = -alpha * w12_sum
    w12 = w12 + delta_w12
    b1_sum = sum(b1_list) / len(b1_list)
    delta_b1 = -alpha * b1_sum
    b1 = b1 + delta_b1
    w2_sum = sum(w2_list) / len(w2_list)
    delta_w2 = -alpha * w2_sum
    w2 = w2 + delta_w2
    b2_sum = sum(b2_list) / len(b2_list)
    delta_b2 = -alpha * b2_sum
    b2 = b2 + delta_b2

    L_mean = sum(L_list) / len(L_list)
    print(L_mean)
    return w11, w12, w2, b1, b2


def neuron1(x1, x2, w11, w12, b1):
    z = w11 * x1 + w12 * x2 + b1
    return z


def neuron2(y, w2, b2):
    z = w2 * y + b2
    return z


# Sigmoid activation function
def a(z):
    return 1 / (1 + math.e**-z)


##
# Simple AND example
##
data = [[0, 0], [0, 1], [1, 0], [1, 1]]
y = [0, 0, 0, 1]

w11 = random.uniform(0, 2)
w12 = random.uniform(0, 2)
w2 = random.uniform(0, 2)
b1 = b2 = 0

for i in range(20):
    w11, w12, w2, b1, b2 = training(data, y, w11, w12, w2, b1, b2)
    # print(w11, w12, b1, w2, b2)
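If you want to see the actual predictions after training, a quick check like this can be appended (not part of the code above, it just reuses neuron1/neuron2/a); with only 20 epochs and alpha=0.1 the outputs will still be far from 0 and 1, so train longer if you want them closer to the targets.

# predictions of the trained network on the AND inputs
for x1, x2 in data:
    yhat = a(neuron2(a(neuron1(x1, x2, w11=w11, w12=w12, b1=b1)), w2=w2, b2=b2))
    print(x1, x2, yhat)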