I've programmed a linear regression model from scratch. I use the sum of squared residuals (SSR) as the loss function for gradient descent. For testing I use perfectly linear data (y = x).
When I run the algorithm, the intercept b barely changes, and as a result the slope m is not calculated correctly either.
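For reference, this is the loss I'm minimizing and the gradient that ssr and ssr_gradient are supposed to compute (note that my ssr_gradient additionally multiplies every term by 1/n, i.e. it is really the gradient of the mean squared error):

SSR(m, b) = \sum_{i=1}^{n} \left( y_i - (m x_i + b) \right)^2
\frac{\partial SSR}{\partial m} = -2 \sum_{i=1}^{n} x_i \left( y_i - (m x_i + b) \right)
\frac{\partial SSR}{\partial b} = -2 \sum_{i=1}^{n} \left( y_i - (m x_i + b) \right)

Here is my full code: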
%matplotlib qt5

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

X = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
y = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12345)


class LinearRegression():

    def __init__(self):
        self.X = None
        self.y = None

    def ssr(self, m, b):
        # Sum of squared residuals for the line y = m*x + b
        sum = 0
        for i in range(len(self.X)):
            sum += (self.y[i] - (m * self.X[i] + b)) ** 2
        return sum

    def ssr_gradient(self, m, b):
        sum_m = 0
        sum_b = 0
        n = len(self.X)
        for i in range(n):
            error = self.y[i] - (m * self.X[i] + b)
            derivative_m = -(2/n) * self.X[i] * error  # Derivative w.r.t. m
            derivative_b = -(2/n) * error              # Derivative w.r.t. b
            sum_m += derivative_m
            sum_b += derivative_b
        return sum_m, sum_b

    def fit(self, X, y, m, b):  # Gradient descent
        self.X = X
        self.y = y

        # Precompute the loss surface for the 3D plot
        M, B = np.meshgrid(np.arange(-10, 10, 0.1), np.arange(-10, 10, 0.1))
        SSR = np.zeros_like(M)
        for i in range(M.shape[0]):
            for j in range(M.shape[1]):
                SSR[i, j] = self.ssr(M[i, j], B[i, j])

        fig, axes = plt.subplots(1, 2, figsize=(12, 6))
        gd_model = fig.add_subplot(121, projection="3d", computed_zorder=False)  # 3D axis for the loss surface
        lin_reg_model = axes[1]                                                  # 2D axis for the current fit

        current_pos = (m, b, self.ssr(m, b))
        learning_rate = 0.001
        min_step_size = 0.001
        max_steps = 1000
        current_steps = 0

        while current_steps < max_steps:
            M_derivative, B_derivative = self.ssr_gradient(current_pos[0], current_pos[1])
            M_step_size, B_step_size = M_derivative * learning_rate, B_derivative * learning_rate

            if abs(M_step_size) < min_step_size or abs(B_step_size) < min_step_size:
                break

            M_new, B_new = current_pos[0] - M_step_size, current_pos[1] - B_step_size
            current_pos = (M_new, B_new, self.ssr(M_new, B_new))
            print(f"Parameters: m: {current_pos[0]}; b: {current_pos[1]}; SSR: {current_pos[2]}")
            current_steps += 1

            # Animate: draw the current fit and the current position on the loss surface
            x = np.arange(0, 10, 1)
            y = current_pos[0] * x + current_pos[1]
            lin_reg_model.scatter(X_train, y_train, label="Train", s=75, c="#1f77b4")
            lin_reg_model.plot(x, y)
            gd_model.plot_surface(M, B, SSR, cmap="viridis", zorder=0)
            gd_model.scatter(current_pos[0], current_pos[1], current_pos[2], c="red", zorder=1)
            gd_model.set_xlabel("Slope m")
            gd_model.set_ylabel("Intercept b")
            gd_model.set_zlabel("Sum of squared residuals")
            plt.tight_layout()
            plt.pause(0.001)
            gd_model.clear()
            lin_reg_model.clear()

        self.m = current_pos[0]
        self.b = current_pos[1]

    def predict(self, X_test):
        return self.m * X_test + self.b


lin_reg_model = LinearRegression()
lin_reg_model.fit(X_train, y_train, 1, 10)
This is the result for the initial values m = 1 and b = 10:

Parameters: m: -0.45129949840919587; b: 9.50972664859535; SSR: 145.06534359577407
Obviously this isn't optimal: my data is perfectly linear, so the optimal parameters should be m = 1 and b = 0. But I cannot find the problem in my code. The algorithm converges to different results depending on the initial values, even though it should always reach the same result as long as the SSR function has exactly one minimum.
I tried different learning rates, but the problem persists.
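As a sanity check on that claim, the least-squares problem for a straight line does have a single minimum, and it can be computed in closed form. A minimal sketch using np.polyfit (only for comparison, not part of my model):

import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(10)
y = np.arange(10)  # y = x, so the true optimum is m = 1, b = 0
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12345)

# Degree-1 least-squares fit; np.polyfit returns the coefficients (slope, intercept)
m_opt, b_opt = np.polyfit(X_train, y_train, 1)
print(m_opt, b_opt)  # expected: roughly 1.0 and 0.0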
Thanks to chrslg: I removed the division by n in the derivatives, and after adjusting the learning rate and the maximum number of iterations, the algorithm is much faster and also optimizes b. Using more data than just the small toy data set was helpful as well. And thanks to Dr. Snoopy: plotting the loss for each epoch was very helpful.
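For anyone running into the same thing, here is a minimal self-contained sketch along those lines (not my actual class; the variable names are my own): the division by n is removed from the gradient, the SSR is logged every epoch, and the loss curve is plotted at the end. I keep the original learning rate of 0.001 and 1000 iterations, which already seems enough on this toy data.

import numpy as np
import matplotlib.pyplot as plt

X = np.arange(10, dtype=float)
y = X.copy()  # same toy data, y = x

m, b = 1.0, 10.0        # same initial values as above
learning_rate = 0.001
losses = []

for _ in range(1000):
    error = y - (m * X + b)
    grad_m = -2 * np.sum(X * error)    # d(SSR)/dm, without the division by n
    grad_b = -2 * np.sum(error)        # d(SSR)/db, without the division by n
    losses.append(np.sum(error ** 2))  # SSR at the start of this epoch
    m -= learning_rate * grad_m
    b -= learning_rate * grad_b

print(m, b)  # should end up close to m = 1, b = 0

plt.plot(losses)
plt.xlabel("Epoch")
plt.ylabel("SSR")
plt.show()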