I am developing a neural network from scratch. It consists of the following: input > layer1 (sigmoid) > layer2 > output (softmax). The basic coding is complete, but when I run it I get a weird loss plot, and if I run the code for a large number of iterations I get 'nan' in the output. Any help is appreciated.
I spent a bit of time going through the derivations and checking my implementation, but I cannot spot the issue. See my derivations attached (the part I am interested in is the softmax loss), and sorry about the handwriting. Derivations link: https://drive.google.com/file/d/1VBSO8Ox6U3Vo9FvgT8gKUBZPEEEjzMS_/view?usp=drivesdk
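One quick way to check the derivation independently of the training code is a finite-difference comparison against dE/dZ = softmax(Z) - y. Below is a minimal sketch with illustrative toy shapes (the names softmax_check and cross_entropy and the 10-class example are assumptions for this check, not part of the code that follows):

import numpy as np

# Minimal sketch: numerically verify that the softmax cross-entropy gradient
# with respect to the logits is softmax(Z) - y (toy shapes, hypothetical names).
def softmax_check(z):
    e = np.exp(z - np.max(z))              # shift by max for numerical stability
    return e / np.sum(e)

def cross_entropy(z, y):
    return -np.sum(y * np.log(softmax_check(z)))

rng = np.random.default_rng(0)
z = rng.normal(size=10)                     # logits for 10 classes
y = np.eye(10)[3]                           # one-hot target, class 3

analytic = softmax_check(z) - y             # gradient from the derivation
numeric = np.zeros_like(z)
eps = 1e-6
for i in range(z.size):
    zp, zm = z.copy(), z.copy()
    zp[i] += eps
    zm[i] -= eps
    numeric[i] = (cross_entropy(zp, y) - cross_entropy(zm, y)) / (2 * eps)

print(np.max(np.abs(analytic - numeric)))   # should be on the order of 1e-9 or smaller

The same idea extends to dE/dW1 and dE/dW2 by perturbing individual weights.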
Code:
import numpy as np
import pandas as pd
import tensorflow as tf
import sklearn as sk
from sklearn import preprocessing
from tensorflow.keras.datasets import mnist
import matplotlib.pyplot as plt
(x_train,y_train),(x_test,y_test)=mnist.load_data()
n_train = 100
x_train = x_train[0:n_train].reshape(-1,784)/255
y_train0 = y_train[0:n_train].reshape(-1,)
y_train1 = pd.get_dummies(y_train0)
y_train = np.array( y_train1.astype(int) )
nx = 784 #input size (nx,1)
n1 = 20 #neurons in 1st layer
n_class = 10 #neurons in 2nd layer
lambda0 = 0.00 #center loss parameter
alpha = 0.0001 #gradient descent parameter
#layer weights
#The network is simple
#Input, n1 neurons with sigmoid activation, 10 neurons and softmax output
W1 = 0.001*np.random.rand( n1, nx )
W2 = 0.001*np.random.rand( n_class, n1 )
b2 = 0.001*np.random.rand( n_class,1 )
b1 = 0.001*np.random.rand( n1,1 )
centers = np.random.rand(n_class,1)
def softmax(x):
    exp_sum = np.sum( np.exp(x) )
    return np.exp(x)/( exp_sum )

def sigmoid(x):
    y = 1/(1+ np.exp(-x) )
    return y

def test():
    count = 0
    for t3 in range(0,100):
        #forward propagation #Layer 1
        Z1 = np.dot( W1 , x_train[t3]).reshape(-1,1)
        A1 = sigmoid( Z1 ).reshape(-1,1)
        #forward propagation #Layer 2
        Z2 = np.dot( W2, A1) + b2
        Output = softmax(Z2)
        arg_max = np.argmax(Output)
        if arg_max == np.argmax(y_train[t3:t3+1] ):
            count = count +1
    print(count/100)

def new_center(x,y0,c0):
    c = np.zeros( (n_class, 1) )
    y = np.argmax(y0, axis=-1) #convert to one column array
    for i in range(n_class):
        indx = np.where(y == i)[0] #choose all points that equal class i
        a1 = np.mean( x[ [indx] ].reshape(-1,n_class) )
        c[i,:] = a1
    return c-c0
iter = 50000
sets = 100
loss_temp = np.zeros(( sets ,1))
loss = []
Zi_vector = np.zeros(( sets ,n_class))
y_pred_vector = np.zeros(( sets ,n_class))
loss = np.zeros(( iter,1))
print_counter = 0
for t1 in range (0,iter):
    for t2 in range(0,sets):
        #forward propagation #Layer 1 & 2
        Z1 = np.dot( W1 , x_train[t2:t2+1].T ).reshape(-1,1) + b1
        A1 = sigmoid( Z1 ).reshape(-1,1)
        #forward propagation #Layer 2
        Z2 = np.dot( W2, A1) + b2
        Output = softmax(Z2)
        #back propagation #Layer 2
        dy = Output - y_train[t2:t2+1].T #+ lambda0*(Z2- centers)
        dE_dZ2 = dy
        dE_dW2 = dy*A1.T
        dE_db2 = dy
        W2T = W2.T
        #backpropagation #Layer 1
        dE_dZ2T = np.zeros((n1,1))
        dA1_dZ1 = Z1*(1-Z1)
        for temp1 in range(0,n1):
            dE_dZ2T[temp1] = np.dot( dy.T, W2T[temp1].T )
        dE_dZ2T__dA1_dZ1 = dE_dZ2T*dA1_dZ1
        dE_dW1 = dE_dZ2T__dA1_dZ1 * x_train[t2].T
        # #For regularization
        # L2_W2 = W2*0.001
        # L2_W1 = W1*0.001
        # L2_b2 = b2*0.001
        # L2_b1 = b1*0.001
        #weight update
        W1 = W1 - alpha*(dE_dW1 )
        W2 = W2 - alpha*( dE_dW2 )
        b2 = b2 - alpha*(dy )
        b1 = b1 - alpha*( dE_dZ2T__dA1_dZ1 )
        loss_temp[t2] = ( -np.sum( y_train[t2:t2+1]*np.log10(Output.T) ) )
        Zi_vector[t2] = Z2.reshape(-1,)
        y_pred_vector[t2,:] = Output.reshape(-1,n_class)
    #centers = new_center(Zi_vector,y_train, centers)
    loss[t1] = ( np.mean(loss_temp))
    loss_temp = np.zeros(( 100 ,1))
    print_counter = print_counter + 1
    if print_counter > 100:
        print(t1)
        print_counter = 1
        test()
plt.plot(loss )
plt.xlabel('iterations')
plt.ylabel('loss')
plt.show()
I am not sure exactly what fixed it, but I 1) scaled the inputs to the softmax function and 2) rewrote the backpropagation through W2.
The loss becomes zero after 10000 iterations, and the accuracy reaches 1, which is what I was expecting for an overfitted model trained on so little data.
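As an aside, rescaling the logits into a fixed range (as the softmax below does) changes the output probabilities themselves; the more common way to keep np.exp from overflowing is to subtract the maximum logit, which leaves the softmax mathematically unchanged. A minimal sketch of that standard form (softmax_stable is a hypothetical name, not part of the code below):

import numpy as np

# Sketch of the usual numerically stable softmax: subtracting max(x) from every
# logit does not change the result (the constant cancels in the ratio),
# but it prevents np.exp from overflowing for large inputs.
def softmax_stable(x):
    shifted = x - np.max(x)
    e = np.exp(shifted)
    return e / np.sum(e)

For what it's worth, the other change, np.dot(W2.T, dy) * dA1_dZ1, computes the same quantity as the earlier per-neuron loop, just vectorized.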
Code:
import numpy as np
import pandas as pd
import tensorflow as tf
import sklearn as sk
from sklearn import preprocessing
from tensorflow.keras.datasets import mnist
import matplotlib.pyplot as plt
(x_train,y_train),(x_test,y_test)=mnist.load_data()
n_train = 100
x_train = x_train[0:n_train].reshape(-1,784)/255
y_train0 = y_train[0:n_train].reshape(-1,)
y_train1 = pd.get_dummies(y_train0)
y_train = np.array( y_train1.astype(int) )
nx = 784 #input size (nx,1)
n1 = 20 #neurons in 1st layer
n_class = 10 #neurons in 2nd layer
lambda0 = 0.00 #center loss parameter
alpha = 0.0001 #gradient descent parameter
#layer weights
#The network is simple
#Input, n1 neurons with sigmoid activation, 10 neurons and softmax output
W1 = 0.001*np.random.rand( n1, nx )
W2 = 0.001*np.random.rand( n_class, n1 )
b2 = 0.001*np.random.rand( n_class,1 )
b1 = 0.001*np.random.rand( n1,1 )
centers = np.random.rand(n_class,1)
def softmax(x):
    xmin = np.min(x)
    xmax = np.max(x)
    b = 100
    a = 1
    xnormalized = (b-a)*(x-xmin)/(0.001 +xmax - xmin) + a
    exp_sum = np.sum( np.exp( xnormalized ) )
    return np.exp( xnormalized )/( exp_sum )

def sigmoid(x):
    y = 1/(1+ np.exp(-x) )
    return y

def test():
    count = 0
    for t3 in range(0,100):
        #forward propagation #Layer 1
        Z1 = np.dot( W1 , x_train[t3]).reshape(-1,1)
        A1 = sigmoid( Z1 ).reshape(-1,1)
        #forward propagation #Layer 2
        Z2 = np.dot( W2, A1) + b2
        Output = softmax(Z2)
        arg_max = np.argmax(Output)
        if arg_max == np.argmax(y_train[t3:t3+1] ):
            count = count +1
    print(count/100)

def new_center(x,y0,c0):
    c = np.zeros( (n_class, 1) )
    y = np.argmax(y0, axis=-1) #convert to one column array
    for i in range(n_class):
        indx = np.where(y == i)[0] #choose all points that equal class i
        a1 = np.mean( x[ [indx] ].reshape(-1,n_class) )
        c[i,:] = a1
    return c-c0
iter = 20000
sets = 100
loss_temp = np.zeros(( sets ,1))
loss = []
Zi_vector = np.zeros(( sets ,n_class))
y_pred_vector = np.zeros(( sets ,n_class))
loss = np.zeros(( iter,1))
print_counter = 0
for t1 in range (0,iter):
    for t2 in range(0,sets):
        #forward propagation #Layer 1 & 2
        Z1 = np.dot( W1 , x_train[t2:t2+1].T ).reshape(-1,1) + b1
        A1 = sigmoid( Z1 ).reshape(-1,1)
        #forward propagation #Layer 2
        Z2 = np.dot( W2, A1) + b2
        Output = softmax(Z2)
        #back propagation #Layer 2
        dy = Output - y_train[t2:t2+1].T #+ lambda0*(Z2- centers)
        dE_dZ2 = dy
        dE_dW2 = dy*A1.T
        dE_db2 = dy
        W2 = W2
        #backpropagation #Layer 1
        dE_dZ2T = np.zeros((n1,1))
        dA1_dZ1 = Z1*(1-Z1)
        # for temp1 in range(0,n1):
        #     dE_dZ2T[temp1] = np.dot( dy.T, W2[temp1].T )
        dE_dZ2T__dA1_dZ1 = np.dot(W2.T, dy) * dA1_dZ1
        dE_dW1 = dE_dZ2T__dA1_dZ1 * x_train[t2].T
        # #For regularization
        # L2_W2 = W2*0.001
        # L2_W1 = W1*0.001
        # L2_b2 = b2*0.001
        # L2_b1 = b1*0.001
        #weight update
        W1 = W1 - alpha*(dE_dW1 )
        W2 = W2 - alpha*( dE_dW2 )
        b2 = b2 - alpha*(dy )
        b1 = b1 - alpha*( dE_dZ2T__dA1_dZ1 )
        loss_temp[t2] = ( -np.sum( y_train[t2:t2+1]*np.log10(Output.T) ) )
        Zi_vector[t2] = Z2.reshape(-1,)
        y_pred_vector[t2,:] = Output.reshape(-1,n_class)
    #centers = new_center(Zi_vector,y_train, centers)
    loss[t1] = ( np.mean(loss_temp))
    loss_temp = np.zeros(( 100 ,1))
    print_counter = print_counter + 1
    if print_counter > 100:
        print(t1)
        print_counter = 1
        test()
plt.plot(loss )
plt.xlabel('iterations')
plt.ylabel('loss')
plt.show()
# plt.scatter( Zi_vector[:,1] , Zi_vector[:,2] )
# plt.show()