Consider a regression task where the parameters of the model differ significantly in magnitude, say:
def func(x, p):
    p1, p2, p3 = p
    return np.sin(p1*x) * np.exp(p2*x) * p3
# True Parameters:
p1, p2, p3 = np.pi/0.01, -1.25, 1.2356 # 314.1592, -1.25, 1.2356
The fit requires a sufficiently good initial guess to converge to the correct solution, for example:
p0 = [np.pi/0.01-80, -1.25-1, 1.2356-1]
However, if the initial guess is too far from the true solution, the regression may fail to converge. For example, the following initial guess might not work:
p0 = [np.pi/0.02-10, -1.25-1, 1.2356-1]
A reproducible example is as follows:
import numpy as np
from scipy.optimize import least_squares
import matplotlib.pyplot as plt
def func(x, p):
    p1, p2, p3 = p
    return np.sin(p1*x) * np.exp(p2*x) * p3

def residuals(p, y, x):
    return y - func(x, p)
x = np.linspace(0, 0.05, 50)
p1, p2, p3 = np.pi/0.01, -1.25, 1.2356
y0 = func(x, [p1, p2, p3])
y = y0 + np.random.randn(len(x)) * 0.05
p0 = [np.pi/0.02-10, -1.25-1, 1.2356-1]
result = least_squares(residuals, p0, args=(y, x))
print("True parameters:", [p1, p2, p3])
print("Approximated parameters:", result.x)
x_test = np.linspace(0, 0.05, 200)
y_test = func(x_test, result.x)
y_real = func(x_test, [p1, p2, p3])
plt.plot(x_test, y_test, label="predict")
plt.plot(x_test, y_real, label="true")
plt.plot(x, y, '.r', label="data")
plt.legend()
plt.show()
Or here is a PyTorch version:
import numpy as np
import torch
from torch import nn
import matplotlib.pyplot as plt
class guessEq(nn.Module):
    def __init__(self):
        super(guessEq, self).__init__()
        self.params = nn.Parameter(torch.tensor([np.pi/0.01-10, -1.25+1, 1.2356-1]))

    def forward(self, x):
        out = torch.sin(self.params[0]*x) * \
              torch.exp(self.params[1]*x) * \
              self.params[2]
        return out
x = np.linspace(0, 0.05, 100)
y = np.sin(np.pi/0.01*x) * np.exp(-1.25*x) * 1.2356 + np.random.rand(x.shape[0]) * 0.05
x = torch.tensor(x, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)
x = x.reshape((-1, 1))
y = y.reshape((-1, 1))
model = guessEq()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
mse = nn.MSELoss()
for i in range(2000):
    optimizer.zero_grad()
    y_pred = model(x)
    loss = torch.mean(torch.square(y_pred - y))
    loss.backward()
    if i % 100 == 0:
        print(loss)
    optimizer.step()
x_test = torch.linspace(0, 0.05, 200).reshape((-1, 1))
y_test = model(x_test)
print("True parameters: [{} {} {}]".format(np.pi/0.01, -1.25, 1.2356))
print("Approximated parameters: {}".format(model.params.detach().numpy()))
plt.plot(x_test.detach().cpu().numpy().flatten(), y_test.detach().cpu().numpy().flatten(), c="blue", linewidth=2, label="predict")
plt.plot(x.detach().cpu().numpy().flatten(), y.detach().cpu().numpy().flatten(), '.r', label="train")
plt.legend()
plt.show()
How can this issue be addressed when the initial guess is far from the true solution, or when there is no prior knowledge to guide the initial guess?
Since the optimizer most likely gets stuck in a local minimum, and local optimization algorithms are very sensitive to the initial guess, we can restart from random initializations multiple times and keep the solution with the minimum cost over all runs. This multi-start strategy increases the chance of finding the global minimum, as shown below (here the global minimum was found within 20 runs):
def residuals(p, y, x):
    return (y - func(x, p))**2  # squared error
np.random.seed(10)  # for a reproducible result

num_runs = 20
best_init = None
best_cost = np.inf
best_result = None
p0 = np.zeros(3)

for i in range(num_runs):
    p0 = np.random.normal(size=len(p0))  # random init; any other distribution can be used
    result = least_squares(residuals, p0, args=(y, x), method='lm', verbose=1)
    # print(result.cost)
    if best_cost > result.cost:
        best_init = p0
        best_cost = result.cost
        best_result = result

print(best_init, best_cost)
# [ 0.91745894 -0.11227247 -0.36218045] 0.00041268250199614506

result = best_result
We can also try the different methods (trf, dogbox, lm) available for least squares, and explicitly provide the Jacobian, to see whether that gives better results.
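For instance, here is a minimal sketch of passing an analytic Jacobian to least_squares and comparing the three methods. It is written for the plain residuals y - func(x, p) (not the squared version above), and the names plain_residuals and jac_residuals are just illustrative choices:

def plain_residuals(p, y, x):
    return y - func(x, p)

def jac_residuals(p, y, x):
    # Analytic Jacobian of r = y - p3*sin(p1*x)*exp(p2*x)
    # with respect to (p1, p2, p3); shape (len(x), 3).
    p1, p2, p3 = p
    J = np.empty((x.size, 3))
    J[:, 0] = -p3 * x * np.cos(p1*x) * np.exp(p2*x)  # dr/dp1
    J[:, 1] = -p3 * x * np.sin(p1*x) * np.exp(p2*x)  # dr/dp2
    J[:, 2] = -np.sin(p1*x) * np.exp(p2*x)           # dr/dp3
    return J

for method in ("trf", "dogbox", "lm"):
    res = least_squares(plain_residuals, p0, jac=jac_residuals, args=(y, x), method=method)
    print(method, res.cost)

The resulting res.cost values can then be compared across methods (and across random restarts), in the same way as in the multi-start loop above.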