I'm trying to build a multiple linear regression model for the Boston housing dataset from scikit-learn.
I optimize the model with stochastic gradient descent (SGD), and it seems I have to use a very small learning rate (0.000000001) to get the model to learn at all. With any larger learning rate the model fails to learn and the weights diverge to NaN or inf.
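(For what it's worth, the raw features differ in scale by orders of magnitude, which turns out to matter below; a quick check, just a sketch:)

# sketch: inspect per-feature ranges of the raw Boston inputs
from sklearn import datasets
import numpy as np

x = datasets.load_boston().data
print(x.min(axis=0))
print(x.max(axis=0))  # e.g. TAX reaches 711 while NOX never exceeds 1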
So here's my question: why do I need such a tiny learning rate, and why does anything larger diverge?
Here's my code:
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt
def loss(x, y, w):
    # root-mean-square error between predictions and targets
    predict_y = x @ w
    return np.sqrt(np.mean(np.square(y - predict_y)))
def status(w):
    # print the current weights, then the losses and sample predictions on both splits
    w_ = np.squeeze(w)
    print("w = [", end="")
    for i in range(14):
        if i == 13:
            print(w_[i], end="]")
        else:
            print(w_[i], end=", ")
    print()
    training_loss = loss(training_x, training_y, w)
    validation_loss = loss(validation_x, validation_y, w)
    print("Training Loss = " + str(training_loss))
    print("Validation Loss = " + str(validation_loss))
    training_predict_y = training_x @ w
    validation_predict_y = validation_x @ w
    print("{:^40s}|{:^40s}".format("training", "validation"))
    print("{:^20s}{:^20s}|{:^20s}{:^20s}".format("predict_y", "true_y", "predict_y", "true_y"))
    for i in range(10):
        print("{:^20f}{:^20f}|{:^20f}{:^20f}".format(float(training_predict_y[i]), float(training_y[i]), float(validation_predict_y[i]), float(validation_y[i])))
    print()
def plot(title, data):
    plt.title(title)
    plt.plot(range(len(data)), data)
    plt.savefig(title + ".png", dpi=300)
    plt.show()
np.random.seed(2020) # for consistency
# data
dataset = datasets.load_boston()  # note: load_boston is deprecated in recent scikit-learn versions
x = dataset.data
y = dataset.target
# reformat the data
x_ = np.concatenate((np.ones((x.shape[0], 1)), x), axis=1)  # add a bias column of ones (x0 = 1)
y_ = np.expand_dims(y, axis=1)
# divide data into training set and validation set
training_x = x_[0:406, :]
training_y = y_[0:406, :]
validation_x = x_[406:506, :]
validation_y = y_[406:506, :]
# initialize w
w = np.random.rand(x_.shape[1], 1)
print("Before Training...")
status(w)
# hyperparameters
epochs = 100000
lr = 0.000000001
training_losses = []
validation_losses = []
data_num = training_x.shape[0]
for epoch in range(epochs):
    for i in range(data_num):
        sample = training_x[i:i + 1, :]
        true_y = training_y[i:i + 1, :]
        predict_y = sample @ w
        # gradient of the squared error w.r.t. w: d/dw (y - xw)^2 = -2 x^T (y - xw)
        gradient = -(2 / sample.shape[0]) * sample.T @ (true_y - predict_y)
        # update w
        w = w - lr * gradient
    # record full-dataset losses once per epoch
    training_loss = loss(training_x, training_y, w)
    validation_loss = loss(validation_x, validation_y, w)
    training_losses.append(training_loss)
    validation_losses.append(validation_loss)
print("After Training...")
status(w)
plot("Training Loss - SGD", training_losses)
plot("Validation Loss - SGD", validation_losses)
The reason for that mysteriously small learning rate is that I did not normalize the input data. Data must be normalized when the feature scales vary widely: the gradient components are proportional to the feature values, so large-scale features like TAX produce huge updates that make any reasonable learning rate overshoot. After standardizing the data, the model learned with a reasonable learning rate (0.001).
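For completeness, here is roughly what the fix looks like; a minimal sketch that standardizes each feature using statistics from the training rows only (variable names mirror the code above):

# sketch: standardize features with training-set statistics before adding the bias column
mean = x[0:406, :].mean(axis=0)
std = x[0:406, :].std(axis=0)
x_norm = (x - mean) / std  # each feature now has roughly zero mean and unit variance
x_ = np.concatenate((np.ones((x_norm.shape[0], 1)), x_norm), axis=1)  # x0 = 1 bias column
# ... then run the same training loop as above; lr = 0.001 converges without diverging

Computing mean and std on the training rows only avoids leaking validation data, and with all features on a comparable scale a single learning rate works for every weight.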