I am writing a neural network in C++ to approximate the function x·sin(x), using a single hidden layer of 5 neurons. The hidden neurons use tanh activation and the output layer is linear. I train on 30 examples for 10,000 epochs.
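For reference, the training data is generated roughly like this (a minimal sketch: the sampling interval [-5, 5] and the exact spacing are assumptions; only the array names and sizes match the code below):

#include <cmath>

const int numTrainingSets = 30;
const int numInputs  = 1;
const int numOutputs = 1;
double training_inputs[numTrainingSets][numInputs];
double training_outputs[numTrainingSets][numOutputs];

// Fill the training set with (x, x*sin(x)) pairs on an assumed interval.
void generateTrainingData()
{
    for (int i = 0; i < numTrainingSets; ++i) {
        double x = -5.0 + 10.0 * i / (numTrainingSets - 1); // assumed range [-5, 5]
        training_inputs[i][0]  = x;
        training_outputs[i][0] = x * std::sin(x);
    }
}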
Before I shuffled my data, the fit was good and the MSE was near zero (red: predicted, green: actual): [plot omitted]
But when I shuffle the indices of the training examples (and verify that the shuffling really does shuffle), I get terrible results: [plot omitted]
with the error-vs-epoch curve looking like this: [plot omitted]
What is going wrong here? Can shuffling really be responsible for this?
Here is the (simplified) code for reference:
//Shuffle Function (Fisher-Yates using rand())
void shuffle(int *array, size_t n)
{
    if (n > 1) //If no. of training examples > 1
    {
        for (size_t i = 0; i < n - 1; i++)
        {
            size_t j = i + rand() / (RAND_MAX / (n - i) + 1);
            int t = array[j];
            array[j] = array[i];
            array[i] = t;
        }
    }
}
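(Aside: in modern C++ the hand-rolled Fisher-Yates above can be replaced by std::shuffle with a properly seeded engine. A minimal sketch, using the hypothetical name shuffleIndices to avoid clashing with the function above:)

#include <algorithm> // std::shuffle
#include <cstddef>   // std::size_t
#include <random>    // std::mt19937, std::random_device

// Same intent as shuffle() above, but using <random> instead of rand().
void shuffleIndices(int *array, std::size_t n)
{
    static std::mt19937 rng{std::random_device{}()};
    std::shuffle(array, array + n, rng);
}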
int main(int argc, const char * argv[])
{
    //Some other actions

    ///FOR INDEX SHUFFLING
    int trainingSetOrder[numTrainingSets];
    for (int j=0; j<numTrainingSets; ++j)
        trainingSetOrder[j] = j;

    ///TRAINING
    //std::cout<<"start train\n";
    vector<double> performance, epo; ///STORE MSE, EPOCH
    for (int n=0; n < epoch; n++)
    {
        shuffle(trainingSetOrder, numTrainingSets);
        for (int i=0; i<numTrainingSets; i++)
        {
            int x = trainingSetOrder[i];
            //cout<<" "<<"("<<training_inputs[x][0]<<","<<training_outputs[x][0]<<")";

            /// Forward pass
            for (int j=0; j<numHiddenNodes; j++)
            {
                double activation = hiddenLayerBias[j];
                //std::cout<<"Training Set :"<<x<<"\n";
                for (int k=0; k<numInputs; k++) {
                    activation += training_inputs[x][k]*hiddenWeights[k][j];
                }
                hiddenLayer[j] = tanh(activation);
            }
            for (int j=0; j<numOutputs; j++) {
                double activation = outputLayerBias[j];
                for (int k=0; k<numHiddenNodes; k++)
                {
                    activation += hiddenLayer[k]*outputWeights[k][j];
                }
                outputLayer[j] = lin(activation);
            }

            /// Backprop
            /// For V
            double deltaOutput[numOutputs];
            for (int j=0; j<numOutputs; j++) {
                double errorOutput = (training_outputs[i][j]-outputLayer[j]);
                deltaOutput[j] = errorOutput*dlin(outputLayer[j]);
            }

            /// For W
            //Some Code

            /// Update step
            /// For V and b
            ///Some Code

            /// For W and c
            for (int j=0; j<numHiddenNodes; j++) {
                //c
                hiddenLayerBias[j] += deltaHidden[j]*lr;
                //W
                for (int k=0; k<numInputs; k++) {
                    hiddenWeights[k][j] += training_inputs[i][k]*deltaHidden[j]*lr;
                }
            }
        }
    }
    return 0;
}
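(The snippet calls lin and dlin without showing them; since the output layer is linear, they are presumably the identity and its derivative, something like:)

// Assumed definitions (not shown above): linear output activation and its derivative.
double lin(double x)  { return x; }       // identity
double dlin(double /*y*/) { return 1.0; } // derivative of the identity is 1 everywhere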
I found two (silly!) mistakes in the training part, both the same off-by-index slip: using the loop counter i where the shuffled index x = trainingSetOrder[i] was needed.
1)
/// Backprop
/// For V
double deltaOutput[numOutputs];
for (int j=0; j<numOutputs; j++) {
    double errorOutput = (training_outputs[i][j]-outputLayer[j]);
    deltaOutput[j] = errorOutput*dlin(outputLayer[j]);
}
should have been
/// Backprop
/// For V
double deltaOutput[numOutputs];
for (int j=0; j<numOutputs; j++) {
    double errorOutput = (training_outputs[x][j]-outputLayer[j]);
    deltaOutput[j] = errorOutput*dlin(outputLayer[j]);
}
2)
/// For W and c
for (int j=0; j<numHiddenNodes; j++) {
    //c
    hiddenLayerBias[j] += deltaHidden[j]*lr;
    //W
    for (int k=0; k<numInputs; k++) {
        hiddenWeights[k][j] += training_inputs[i][k]*deltaHidden[j]*lr;
    }
}
should have been
/// For W and c
for (int j=0; j<numHiddenNodes; j++) {
    //c
    hiddenLayerBias[j] += deltaHidden[j]*lr;
    //W
    for (int k=0; k<numInputs; k++) {
        hiddenWeights[k][j] += training_inputs[x][k]*deltaHidden[j]*lr;
    }
}
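This also explains why the bug only appeared after shuffling: without shuffling, trainingSetOrder[i] == i, so indexing with i accidentally pointed at the right example; after shuffling, the delta was computed against the wrong target and the update applied to the wrong input. A standalone sketch of the masking effect:

#include <cstdio>

int main()
{
    const int n = 5;
    int identityOrder[n] = {0, 1, 2, 3, 4}; // unshuffled: x == i always, bug invisible
    int shuffledOrder[n] = {3, 0, 4, 1, 2}; // one possible shuffle: x != i for most i
    for (int i = 0; i < n; ++i)
        std::printf("i=%d  unshuffled x=%d (%s)  shuffled x=%d (%s)\n",
                    i,
                    identityOrder[i], identityOrder[i] == i ? "match" : "MISMATCH",
                    shuffledOrder[i], shuffledOrder[i] == i ? "match" : "MISMATCH");
    return 0;
}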