I'm training a simple ANN model (MLP) using tanh(x) as the activation function
and, after some iterations, it converges with an error of 10^-5. Here's my full code:
import numpy as np
import pandas as pd
# Training data set: one input column and one column of desired outputs.
x = pd.DataFrame({'valores x': [1, 2, 3]})
d = pd.DataFrame({'valores desejados': [5, 4, 3]})
# Scale the desired values by 1.05 times their column maximum so they fall
# strictly between 0 and +1, then turn both frames into NumPy arrays.
d = (d / (1.05 * d.max())).to_numpy()
x = x.to_numpy()
# Derivative of tanh(x) = sech²(x) = 1 - (tanh(x))²
def df(x):
    """Return the derivative of tanh evaluated element-wise at ``x``.

    Args:
        x: Scalar or array of pre-activation values.

    Returns:
        ``1 - tanh(x)**2``, with the same shape as ``x``.
    """
    y = 1 - np.power(np.tanh(x), 2)
    return y
#def rede_mlp(n, x, d, net, k, precisao):
# Two-layer network (tanh hidden layer, tanh output) trained one sample at a
# time until the mean squared error of an epoch drops below `precisao`.
# net = number of neurons in the first layer
# n = learning rate
# precisao = target precision for the mean squared error
net = 3
n = 0.1
precisao = 0.00001
w1 = np.random.rand(net, len(x[0]))
w2 = np.random.rand(1, net)
E_M = 1
epocas = 0
while E_M > precisao:
    E_M = 0
    errofinal = 0
    for i in range(0, len(x)):
        # FORWARD
        i1 = np.matmul(w1, x[i].reshape(len(x[i]), 1))
        y1 = np.tanh(i1)
        i2 = np.matmul(w2, y1)
        y2 = np.tanh(i2)
        # error with respect to the desired value
        erro = d[i].reshape(len(d[i]), 1) - y2
        # BACKPROPAGATION
        # Compute both deltas before touching any weights: delta_1 must be
        # propagated through the same w2 that produced y2 in the forward pass.
        delta_2 = erro*df(i2)
        delta_1 = (np.matmul(w2.T, delta_2))*df(i1)
        w2 = w2 + n*(np.matmul(delta_2, y1.reshape(1, net)))
        w1 = w1 + n*(np.matmul(delta_1, x[i].reshape(1, len(x[i]))))
        errofinal = errofinal + 0.5*erro**2
    # Mean of the per-sample squared errors over this epoch.
    E_M = errofinal/len(x)
    epocas += 1
    print(E_M)
After that, I tried to change the activation function to leaky ReLU, but it didn't converge. I have changed the learning rate n
several times, but the error is still high. It's around 7.95, which is big for my data. Here's my attempt:
import numpy as np
import pandas as pd
# Training data set: one input column and one column of desired outputs.
x = pd.DataFrame({'valores x': [1, 2, 3]})
d = pd.DataFrame({'valores desejados': [5, 4, 3]})
# Convert both frames to NumPy arrays (the desired values are used as-is,
# without any normalisation).
d = d.to_numpy()
x = x.to_numpy()
def df(x):
    """First attempt at the leaky-ReLU derivative (0.01 for x <= 0, else 1).

    NOTE: the two masked assignments run one after the other on the same
    array. For floating-point input the first one turns every non-positive
    entry into 0.01, which is positive, so the second one then overwrites
    every entry with 1. The logic is kept as posted.

    Args:
        x: Array-like of pre-activation values; it is copied, not modified.

    Returns:
        A new array with the same shape as ``x``.
    """
    x = np.array(x)  # work on a copy so the caller's array is untouched
    x[x<=0] = 0.01
    x[x>0] = 1  # also matches the entries that were just set to 0.01
    return x
def f(x):
    """Leaky ReLU activation, applied element-wise.

    Args:
        x: Array of pre-activation values.

    Returns:
        ``x`` where ``x > 0`` and ``0.01 * x`` elsewhere.
    """
    return np.where(x > 0, x, x * 0.01)
#def rede_mlp(n, x, d, net, k, precisao):
# Two-layer network using f/df as activation and derivative on both layers,
# trained one sample at a time until the summed squared error of an epoch
# drops below `precisao`.
# net = number of neurons in the first layer
# n = learning rate
# precisao = target precision for the squared error
net = 3
n = 1e-4
precisao = 0.0001
w1 = np.random.rand(net, len(x[0]))
w2 = np.random.rand(1, net)
E_M = 20
epocas = 0
while E_M > precisao:
    E_M = 0
    errofinal = 0
    for i in range(0, len(x)):
        # FORWARD
        i1 = np.matmul(w1, x[i].reshape(len(x[i]), 1))
        y1 = f(i1)
        i2 = np.matmul(w2, y1)
        y2 = f(i2)
        # error with respect to the desired value
        erro = d[i].reshape(len(d[i]), 1) - y2
        # BACKPROPAGATION
        # Compute both deltas before touching any weights: delta_1 must be
        # propagated through the same w2 that produced y2 in the forward pass.
        delta_2 = erro*df(i2)
        delta_1 = (np.matmul(w2.T, delta_2))*df(i1)
        w2 = w2 + n*(np.matmul(delta_2, y1.reshape(1, net)))
        w1 = w1 + n*(np.matmul(delta_1, x[i].reshape(1, len(x[i]))))
        errofinal = errofinal + 0.5*erro**2
    #E_M = errofinal/len(x)
    # The epoch error here is the sum (not the mean) over the samples.
    E_M = errofinal
    epocas += 1
    print(E_M)
EDITED:
After some modifications, here's my ReLu code (but the error is still high ~7.77):
import numpy as np
import pandas as pd
# Training data set: one input column and one column of desired outputs.
x = pd.DataFrame({'valores x': [1, 2, 3]})
d = pd.DataFrame({'valores desejados': [5, 4, 3]})
# Convert both frames to NumPy arrays (the desired values are used as-is,
# without any normalisation).
d = d.to_numpy()
x = x.to_numpy()
def df(x):
    """Derivative of the leaky ReLU, evaluated element-wise.

    Args:
        x: Array of pre-activation values.

    Returns:
        An array shaped like ``x`` holding 0.01 where ``x <= 0`` and 1
        elsewhere.
    """
    return np.where(x <= 0, 0.01, 1)
def f(x):
    """Leaky ReLU activation, applied element-wise.

    Args:
        x: Array of pre-activation values.

    Returns:
        ``x`` where ``x > 0`` and ``0.01 * x`` elsewhere.
    """
    return np.where(x > 0, x, x * 0.01)
#def rede_mlp(n, x, d, net, k, precisao):
# Two-layer network using f/df as activation and derivative on both layers,
# trained one sample at a time until the summed squared error of an epoch
# drops below `precisao`.
# net = number of neurons in the first layer
# n = learning rate
# precisao = target precision for the squared error
net = 3
n = 1e-3
precisao = 0.1
w1 = np.random.rand(net, len(x[0]))
w2 = np.random.rand(1, net)
E_M = 20
epocas = 0
while E_M > precisao:
    E_M = 0
    errofinal = 0
    for i in range(0, len(x)):
        # FORWARD
        i1 = np.matmul(w1, x[i].reshape(len(x[i]), 1))
        y1 = f(i1)
        i2 = np.matmul(w2, y1)
        y2 = f(i2)
        # error with respect to the desired value
        erro = d[i].reshape(len(d[i]), 1) - y2
        # BACKPROPAGATION
        # Both deltas are computed from the forward-pass weights before
        # either weight matrix is updated.
        delta_2 = erro*df(i2)
        delta_1 = (np.matmul(w2.T, delta_2))*df(i1)
        w2 = w2 + n*(np.matmul(delta_2, y1.reshape(1, net)))
        w1 = w1 + n*(np.matmul(delta_1, x[i].reshape(1, len(x[i]))))
        errofinal = errofinal + 0.5*erro**2
    #E_M = errofinal/len(x)
    # The epoch error here is the sum (not the mean) over the samples.
    E_M = errofinal
    epocas += 1
    print(E_M)
You need to add a bias to the network.
The equation you are trying to model is y = 6 - x, which is trivial if you can use 6 as an intercept (bias), but I think actually impossible if you do not.
Many functions are much easier to represent once you add the bias, which is why including one is standard practice. This Q&A on the role of bias in NNs explains more thoroughly.
I modified your code to add the bias, as well as follow more typical naming conventions, and it converges for me.
# Two-layer network with a bias term on each layer, using f/df as activation
# and derivative, trained one sample at a time until the summed squared error
# of an epoch drops below `precisao`.
net = 3
n = 1e-3
precisao = 0.0001
w1 = np.random.rand(net, len(x[0]))
# Biases are created with their final per-neuron shapes so that the in-place
# updates below never change their type or shape.
bias1 = np.random.rand(net, 1)
w2 = np.random.rand(1, net)
bias2 = np.random.rand(1, 1)
E_M = 20
epocas = 0
while E_M > precisao:
    E_M = 0
    errofinal = 0
    for i in range(0, len(x)):
        # FORWARD
        a0 = x[i].reshape(-1, 1)
        targ = d[i].reshape(-1, 1)
        z1 = np.matmul(w1, a0) + bias1
        a1 = f(z1)
        z2 = np.matmul(w2, a1) + bias2
        a2 = f(z2)
        # Error is output minus target, so the updates below subtract.
        erro = a2 - targ
        # BACKPROPAGATION
        delta_2 = erro * df(z2)
        delta_1 = np.matmul(w2.T, delta_2) * df(z1)
        bias2 -= n * delta_2
        bias1 -= n * delta_1
        w2 -= n * np.matmul(delta_2, a1.T)
        w1 -= n * np.matmul(delta_1, a0.T)
        errofinal = errofinal + 0.5*erro**2
    #E_M = errofinal/len(x)
    # The epoch error here is the sum (not the mean) over the samples.
    E_M = errofinal
    epocas += 1
    # Report progress only every 1000 epochs.
    if epocas % 1000 == 0:
        print(epocas, E_M)
I increased the learning rate so it would converge more quickly.
1000 [[0.14401507]]
2000 [[0.00028834]]
Earlier bug fix suggestion
You are setting the derivative always equal to 1.
def df(x):
    """Leaky-ReLU derivative as originally posted, quoted to show the bug.

    For floating-point input the first masked assignment turns every
    non-positive entry into 0.01, which is positive, so the second one then
    overwrites every entry with 1.

    Args:
        x: Array-like of pre-activation values; it is copied, not modified.

    Returns:
        A new array with the same shape as ``x``.
    """
    x = np.array(x)  # work on a copy so the caller's array is untouched
    x[x<=0] = 0.01
    x[x>0] = 1  # also matches the entries that were just set to 0.01
    return x
The line `x[x<=0] = 0.01` sets all non-positive values to 1/100, a positive value. After that every value is positive, since the already-positive values go through unaffected and the negative-or-zero values just turned positive. So the next line `x[x>0] = 1` sets all derivatives to 1.
Try this:
def df(x):
    """Derivative of the leaky ReLU, evaluated element-wise.

    Both branches are chosen in a single ``np.where`` call, so the value
    written for one branch can never be re-matched by the other.

    Args:
        x: Array-like of pre-activation values.

    Returns:
        An array shaped like ``x`` holding 0.01 where ``x <= 0`` and 1
        elsewhere.
    """
    return np.where(np.array(x) <= 0, 0.01, 1)