python neural-network backpropagation cross-entropy mlp

Why does my neural network, coded from scratch, have such a weird loss trend?


I am developing a neural network from scratch. It consists of the following: input > layer1 (sigmoid) > layer2 > output (softmax). The basic coding is complete, but when I run it I get a weird loss plot. Also, if I run the code for a large number of iterations I get 'nan' in the output. Any help is appreciated. [loss plot]

I spent a bit of time going through the derivations and checking my implementation but cannot spot the issue. My derivations are attached (I am interested in the softmax loss; sorry about the handwriting): https://drive.google.com/file/d/1VBSO8Ox6U3Vo9FvgT8gKUBZPEEEjzMS_/view?usp=drivesdk
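
The result from the derivation that the code relies on is that, for softmax combined with cross-entropy, the gradient of the loss with respect to the pre-softmax scores is simply Output - y (this is the dy used in the backpropagation below). A small standalone sketch that checks this identity numerically, with throwaway names and separate from the network code:

    import numpy as np

    # Finite-difference check of dE/dZ = softmax(Z) - y for a single sample
    # (standalone sketch, not part of the training code below).
    def softmax(z):
        e = np.exp(z - np.max(z))            # max-shift only to keep the check well behaved
        return e / np.sum(e)

    def cross_entropy(z, y):
        return -np.sum(y * np.log(softmax(z)))   # natural-log cross-entropy

    rng = np.random.default_rng(0)
    z = rng.normal(size=10)
    y = np.zeros(10)
    y[3] = 1.0                               # one-hot target

    analytic = softmax(z) - y                # gradient from the derivation
    numeric = np.zeros_like(z)
    eps = 1e-6
    for i in range(z.size):
        zp, zm = z.copy(), z.copy()
        zp[i] += eps
        zm[i] -= eps
        numeric[i] = (cross_entropy(zp, y) - cross_entropy(zm, y)) / (2 * eps)

    print(np.max(np.abs(analytic - numeric)))    # should be around 1e-9 or smaller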

code


import numpy as np
import pandas as pd
import tensorflow as tf
import sklearn as sk
from sklearn import preprocessing
from tensorflow.keras.datasets import mnist
import matplotlib.pyplot as plt

(x_train,y_train),(x_test,y_test)=mnist.load_data()

n_train = 100
x_train = x_train[0:n_train].reshape(-1,784)/255
y_train0 = y_train[0:n_train].reshape(-1,)
y_train1 = pd.get_dummies(y_train0)
y_train = np.array( y_train1.astype(int) )


nx = 784 #input size (nx,1)
n1 = 20   #neurons in 1st layer
n_class = 10  #neurons in 2nd layer


lambda0 = 0.00  #center loss parameter
alpha =  0.0001  #gradient descent parameter

#layer weights
#The network is simple 
#Input, n1 neurons with sigmoid activation, 10 neurons and softmax output
W1 = 0.001*np.random.rand( n1, nx )
W2 = 0.001*np.random.rand( n_class, n1 )
b2 = 0.001*np.random.rand( n_class,1  )
b1 = 0.001*np.random.rand( n1,1  )
centers = np.random.rand(n_class,1)


def softmax(x):
 
    exp_sum = np.sum( np.exp(x) )
    
    return np.exp(x)/( exp_sum )

def sigmoid(x):
    y = 1/(1+ np.exp(-x) )
    
    return y



def test():
    count = 0
    for t3 in range(0,100):
        
        #forward propagation #Layer 1
        Z1 = np.dot( W1 , x_train[t3]).reshape(-1,1)  
        A1 = sigmoid( Z1 ).reshape(-1,1)         
        #forward propagation #Layer 2
        Z2 = np.dot( W2, A1)  + b2
        Output = softmax(Z2)
        
        
        arg_max =  np.argmax(Output)
        
        if arg_max == np.argmax(y_train[t3:t3+1] ):
            count = count +1
            
    
    print(count/100)        
    

def new_center(x,y0,c0):
    c = np.zeros( (n_class, 1) )
    y = np.argmax(y0, axis=-1) #convert one-hot rows to class indices
    for i in range(n_class):
        indx = np.where(y == i)[0] #choose all points that equal class i
        a1 = np.mean( x[ [indx] ].reshape(-1,n_class)  )
        c[i,:] = a1
        
    return c-c0

iter = 50000
sets = 100
loss_temp = np.zeros(( sets ,1))
loss = []
Zi_vector =  np.zeros(( sets ,n_class))
y_pred_vector = np.zeros(( sets ,n_class))
loss = np.zeros(( iter,1))

print_counter = 0
for t1 in range (0,iter):
    
    for t2 in range(0,sets):
    
        #forward propagation #Layer 1 & 2
        Z1 = np.dot( W1 , x_train[t2:t2+1].T ).reshape(-1,1)  + b1
        A1 = sigmoid( Z1 ).reshape(-1,1)         
        #forward propagation #Layer 2
        Z2 = np.dot( W2, A1)  + b2
        Output = softmax(Z2)


        #back propagation #Layer 2
        dy = Output - y_train[t2:t2+1].T #+ lambda0*(Z2- centers)
        
        dE_dZ2 = dy
        dE_dW2 = dy*A1.T
        dE_db2 = dy
        W2T = W2.T
        
        
        #backpropagation #Layer 1
        dE_dZ2T = np.zeros((n1,1))
        dA1_dZ1 = Z1*(1-Z1) 
        
        for temp1 in range(0,n1):
            dE_dZ2T[temp1] = np.dot( dy.T, W2T[temp1].T )
            
        dE_dZ2T__dA1_dZ1 = dE_dZ2T*dA1_dZ1  
        dE_dW1 = dE_dZ2T__dA1_dZ1 * x_train[t2].T
        
        
        # #For regularization
        # L2_W2 = W2*0.001
        # L2_W1 = W1*0.001
        # L2_b2 = b2*0.001
        # L2_b1 = b1*0.001
        
        #weight update
        W1 = W1 - alpha*(dE_dW1  )               
        W2 = W2 - alpha*( dE_dW2   )
        b2 = b2 - alpha*(dy   )
        b1 = b1 - alpha*( dE_dZ2T__dA1_dZ1   )
        
        
        loss_temp[t2] = (  -np.sum( y_train[t2:t2+1]*np.log10(Output.T) ) )
        

        Zi_vector[t2] = Z2.reshape(-1,)
        y_pred_vector[t2,:] = Output.reshape(-1,n_class)


    #centers = new_center(Zi_vector,y_train, centers)        
        
    loss[t1] = ( np.mean(loss_temp)) 
    loss_temp = np.zeros(( 100 ,1))


    print_counter = print_counter + 1
    if print_counter > 100:
        print(t1) 
        print_counter = 1 

        
test()

plt.plot(loss )
plt.xlabel('iterations')
plt.ylabel('loss')
plt.show()


Solution

  • I am not sure exactly which change fixed it, but I 1) rescaled the inputs to the softmax function and 2) rewrote the layer-1 backpropagation term that uses W2 (see the note after the code).

    The loss now looks as expected: [loss plot]

    The loss goes to zero after about 10000 iterations, and the accuracy reaches 1, which is what I expected for an overfitted model trained on so little data.

    code

    import numpy as np
    import pandas as pd
    import tensorflow as tf
    import sklearn as sk
    from sklearn import preprocessing
    from tensorflow.keras.datasets import mnist
    import matplotlib.pyplot as plt
    
    (x_train,y_train),(x_test,y_test)=mnist.load_data()
    
    n_train = 100
    x_train = x_train[0:n_train].reshape(-1,784)/255
    y_train0 = y_train[0:n_train].reshape(-1,)
    y_train1 = pd.get_dummies(y_train0)
    y_train = np.array( y_train1.astype(int) )
    
    
    nx = 784 #input size (nx,1)
    n1 = 20   #neurons in 1st layer
    n_class = 10  #neurons in 2nd layer
    
    
    lambda0 = 0.00  #center loss parameter
    alpha =  0.0001  #gradient descent parameter
    
    #layer weights
    #The network is simple 
    #Input, n1 neurons with sigmoid activation, 10 neurons and softmax output
    W1 = 0.001*np.random.rand( n1, nx )
    W2 = 0.001*np.random.rand( n_class, n1 )
    b2 = 0.001*np.random.rand( n_class,1  )
    b1 = 0.001*np.random.rand( n1,1  )
    centers = np.random.rand(n_class,1)
    
    
    def softmax(x):
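        # rescale the scores into roughly [a, b] = [1, 100] before exponentiating,
        # so np.exp stays bounded (one of the two changes from the original version)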
        xmin = np.min(x)
        xmax = np.max(x)
        b = 100
        a = 1
        xnormalized = (b-a)*(x-xmin)/(0.001 +xmax - xmin) + a
        exp_sum = np.sum( np.exp( xnormalized ) )
        
        return np.exp( xnormalized )/( exp_sum )
    
    def sigmoid(x):
        y = 1/(1+ np.exp(-x) )
        
        return y
    
    
    
    def test():
        count = 0
        for t3 in range(0,100):
            
            #forward propagation #Layer 1
            Z1 = np.dot( W1 , x_train[t3]).reshape(-1,1)  
            A1 = sigmoid( Z1 ).reshape(-1,1)         
            #forward propagation #Layer 2
            Z2 = np.dot( W2, A1)  + b2
            Output = softmax(Z2)
            
            
            arg_max =  np.argmax(Output)
            
            if arg_max == np.argmax(y_train[t3:t3+1] ):
                count = count +1
                
        
        print(count/100)        
        
    
    def new_center(x,y0,c0):
        c = np.zeros( (n_class, 1) )
        y = np.argmax(y0, axis=-1) #convert one-hot rows to class indices
        for i in range(n_class):
            indx = np.where(y == i)[0] #choose all points that equal class i
            a1 = np.mean( x[ [indx] ].reshape(-1,n_class)  )
            c[i,:] = a1
            
        return c-c0
    
    iter = 20000
    sets = 100
    loss_temp = np.zeros(( sets ,1))
    loss = []
    Zi_vector =  np.zeros(( sets ,n_class))
    y_pred_vector = np.zeros(( sets ,n_class))
    loss = np.zeros(( iter,1))
    
    print_counter = 0
    for t1 in range (0,iter):
        
        for t2 in range(0,sets):
        
            #forward propagation #Layer 1 & 2
            Z1 = np.dot( W1 , x_train[t2:t2+1].T ).reshape(-1,1)  + b1
            A1 = sigmoid( Z1 ).reshape(-1,1)         
            #forward propagation #Layer 2
            Z2 = np.dot( W2, A1)  + b2
            Output = softmax(Z2)
    
    
            #back propagation #Layer 2
            dy = Output - y_train[t2:t2+1].T #+ lambda0*(Z2- centers)
            
            dE_dZ2 = dy
            dE_dW2 = dy*A1.T
            dE_db2 = dy
            W2 = W2
            
            
            #backpropagation #Layer 1
            dE_dZ2T = np.zeros((n1,1))
            dA1_dZ1 = Z1*(1-Z1) 
            
            # for temp1 in range(0,n1):
            #     dE_dZ2T[temp1] = np.dot( dy.T, W2[temp1].T )
                
            dE_dZ2T__dA1_dZ1 = np.dot(W2.T, dy) * dA1_dZ1 
            dE_dW1 = dE_dZ2T__dA1_dZ1 * x_train[t2].T
            
            
            # #For regularization
            # L2_W2 = W2*0.001
            # L2_W1 = W1*0.001
            # L2_b2 = b2*0.001
            # L2_b1 = b1*0.001
            
            #weight update
            W1 = W1 - alpha*(dE_dW1  )               
            W2 = W2 - alpha*( dE_dW2   )
            b2 = b2 - alpha*(dy   )
            b1 = b1 - alpha*( dE_dZ2T__dA1_dZ1   )
            
            
            loss_temp[t2] = (  -np.sum( y_train[t2:t2+1]*np.log10(Output.T) ) )
            
    
            Zi_vector[t2] = Z2.reshape(-1,)
            y_pred_vector[t2,:] = Output.reshape(-1,n_class)
    
    
        #centers = new_center(Zi_vector,y_train, centers)        
            
        loss[t1] = ( np.mean(loss_temp)) 
        loss_temp = np.zeros(( 100 ,1))
    
    
        print_counter = print_counter + 1
        if print_counter > 100:
            print(t1) 
            print_counter = 1 
    
            
    test()
    
    plt.plot(loss )
    plt.xlabel('iterations')
    plt.ylabel('loss')
    plt.show()
    
    # plt.scatter( Zi_vector[:,1]  , Zi_vector[:,2] )
    # plt.show()
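
    Two notes for anyone comparing the two versions: the vectorized line np.dot(W2.T, dy) * dA1_dZ1 computes exactly the same quantity as the per-neuron loop in the original code, so of the two changes only the softmax rescaling actually alters the computed values. Also, the more common way to keep softmax from overflowing (and producing 'nan') is to subtract the maximum score before exponentiating, which leaves the output unchanged. A minimal sketch of that alternative, which is not what the code above does:

        import numpy as np

        def softmax_stable(x):
            # exp(x - m) / sum(exp(x - m)) equals exp(x) / sum(exp(x)) for any m;
            # choosing m = max(x) keeps np.exp from overflowing.
            shifted = x - np.max(x)
            exp_x = np.exp(shifted)
            return exp_x / np.sum(exp_x)

        # scores this large would overflow the naive exponentiate-and-normalize version
        z = np.array([[1000.0], [1001.0], [999.0]])
        print(softmax_stable(z).ravel())   # finite and sums to 1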