I am trying to solve a simple optimization problem but can't get past the error "Positive directional derivative for linesearch", and I am wondering what is going on here. I don't see anything problematic with the gradient; the problem is convex but nonlinear. See the code below for reference:
import numpy as np
bounds = ([(6.274329865160556e-06, 14.919501068721157)] * 25
          + [(0.0005248360152365689, 27.172072902944304)] * 25
          + [(0.00014238207960809104, 60.130237219435635)] * 25)
maxcovers = {'x1;cost': 2.5, 'x2;cost': 2.5, 'x3;cost': 2.5}
maxjumps = {'x1;cost': 2.5, 'x2;cost': 2.5, 'x3;cost': 2.5}
paramvaluemappings = {"saturations": {0: np.full((1, 31), 0.234835),
                                      1: np.full((1, 31), 0.339884),
                                      2: np.full((1, 31), 0.583774)},
                      "timevariables": {0: np.full((1, 31), -1.206971),
                                        1: np.full((1, 31), 0.36720),
                                        2: np.full((1, 31), 1.0164)},
                      "adstock": {0: np.zeros(1),
                                  1: np.zeros(1),
                                  2: np.zeros(1)},
                      "intercept": np.full((1, 31), 0.7899)}
totalbudget = 194
x0 = ([0.01000627432986516] * 25
      + [0.01052483601523657] * 25
      + [0.010142382079608091] * 25)
from scipy.optimize import minimize
cons = []
totalbudgetconstraint = {'type': 'ineq',
                         'fun': lambda x: totalbudget - sum(x)}
cons.append(totalbudgetconstraint)
def newobjective2(x, *args):
    paramvaluemappingsordered = args[0]
    horizon = args[3]
    channels = args[4]
    lookbackadstock = 4
    # spend from earlier periods, prepended to each channel's decision variables
    previousxs = {0: np.array([0.06926, 0.70988, 0.00676, 1.4556, 0.0006, 0.0006]),
                  1: np.array([0.64502, 0.56519, 2.38517, 0.05514, 2.65093, 0.00538]),
                  2: np.array([0.15333, 0.01496, 0.00146, 1.57163, 1.84324, 5.86636])}
    summed = 0
    for idx, channel in enumerate(channels):
        saturationentries = paramvaluemappingsordered['saturations'][idx]
        timevariableentries = paramvaluemappingsordered['timevariables'][idx]
        # this channel's full spend series: previous periods plus its slice of x
        expenditurechannel = np.append(previousxs[idx],
                                       x[idx * horizon: idx * horizon + horizon])
        # diminishing-returns response: coefficient * spend ** saturation
        channelentry = timevariableentries * np.power(expenditurechannel, saturationentries)
        summed += channelentry
    intercept = paramvaluemappingsordered['intercept']
    summed += intercept
    # drop the lookback periods, average over the first axis, negate for minimization
    summed = summed[:, lookbackadstock:]
    meanvalue = np.mean(summed, axis=0)
    return -np.sum(meanvalue)
args = [paramvaluemappings, None, None, 25, ["x1;cost", "x2;cost", "x3;cost"]]
res = minimize(fun=newobjective2,
               x0=x0,
               args=(paramvaluemappings, None, None, 25, ["x1;cost", "x2;cost", "x3;cost"]),
               constraints=cons,
               method='SLSQP',  # also tried 'trust-constr'
               bounds=bounds,
               options={'maxiter': 5000,
                        'disp': True,
                        'xtol': 1e-5,  # note: SLSQP itself only uses ftol, not xtol
                        'ftol': 1e-8})
print(res)
It is a simple allocation problem with linear constraints and a nonlinear reward function; it should be solvable with SLSQP and, since the problem is convex, the result should be a global optimum.
Changing x0, e.g. adding 1 to every entry, makes the error go away. I want a way of setting x0 automatically without such tricks, though, and I would also like to understand why this issue appears.
Note that the initial x0 satisfies the constraints.
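For example, checking feasibility of x0 directly (a quick sketch):
# feasibility check of x0 (sketch): budget slack and bound containment
print(totalbudget - sum(x0))                                   # positive, so the budget constraint holds
print(all(lo <= v <= hi for v, (lo, hi) in zip(x0, bounds)))   # True, so x0 is within the bounds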
Increasing ftol also works, at least for xtol > 1e-7, and I wonder why.
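One idea I had for setting x0 automatically (just a sketch, not a final approach): spread part of the budget evenly across all variables and clip the result into the bounds, so the start is strictly feasible rather than sitting on a bound:
# sketch: even split of half the budget, clipped into the bounds
lows = np.array([lo for lo, _ in bounds])
highs = np.array([hi for _, hi in bounds])
x0_auto = np.clip(np.full(len(bounds), 0.5 * totalbudget / len(bounds)), lows, highs)
print(totalbudget - np.sum(x0_auto))   # still positive, so the budget constraint holds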
I found that I can successfully optimize this function with two changes.
First, rewrite the budget limit as a linear constraint:
import scipy.optimize
totalbudgetconstraint = scipy.optimize.LinearConstraint(A=np.ones((1, len(x0))), lb=-np.inf, ub=totalbudget)
This requires the entries of x to sum to at most totalbudget.
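The constraint object then simply replaces the dict-based constraint in the list passed to minimize. A minimal sketch (recent SciPy versions convert new-style constraint objects for SLSQP automatically):
cons = [totalbudgetconstraint]
res = minimize(fun=newobjective2, x0=x0,
               args=(paramvaluemappings, None, None, 25, ["x1;cost", "x2;cost", "x3;cost"]),
               constraints=cons, method='SLSQP', bounds=bounds,
               options={'maxiter': 5000, 'disp': True, 'ftol': 1e-8})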
Second, remove the intercept. The objective adds a constant intercept to the output just before taking the mean:
summed += intercept
Since optimizing the function g(x) = f(x) + c is the same as optimizing f(x), this line can be commented out.
I think this helps because summed contains a bunch of numbers that are small relative to the intercept, and adding a large number to a small number causes the combined number to lose precision.
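To illustrate the general effect (an illustrative sketch with made-up numbers, not the magnitudes from your problem): a finite-difference derivative estimate gets noticeably noisier when a large constant offset dominates the function value, because the offset swallows the low-order bits of the difference.
eps = 1.4901161193847656e-08      # same step that approx_fprime uses below
f = lambda t: t ** 0.23           # small-valued, curved term (made-up example)
g = lambda t: f(t) + 1e6          # same term plus a large constant offset
fd = lambda h, t: (h(t + eps) - h(t)) / eps   # forward-difference derivative
print(fd(f, 0.01))  # close to the true derivative 0.23 * 0.01 ** -0.77, about 7.98
print(fd(g, 0.01))  # much noisier: the constant dominates and precision is lost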
Alternatively, if you need the intercept for other reasons, you can make it conditional and only include it when you are not optimizing. Example:
def newobjective2(x, *args, with_intercept=False):
    # rest of function same as before
    if with_intercept:
        summed += intercept
    # ...
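Then, after running minimize as above (a sketch; the solver sees the default with_intercept=False), you can report the value with the intercept added back in:
value_with_intercept = newobjective2(res.x, *args, with_intercept=True)
print(value_with_intercept)   # objective at the optimum, intercept included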
With these changes, I get the following result:
Optimization terminated successfully (Exit mode 0)
Current function value: -90.8754619614387
Iterations: 21
Function evaluations: 1325
Gradient evaluations: 17
message: Optimization terminated successfully
success: True
status: 0
fun: -90.8754619614387
x: [ 6.274e-06 6.274e-06 ... 7.413e+00 7.423e+00]
nit: 21
jac: [ 2.709e+03 2.709e+03 ... -2.578e-01 -2.576e-01]
nfev: 1325
njev: 17
With the intercept added back in, the value at the solution is -112.2027619614387.
Regarding "I don't see anything problematic for the gradient; the problem is convex but nonlinear": looking at the numerical gradient of this function, the first 25 variables have huge gradient components relative to the other variables.
>>> scipy.optimize.approx_fprime(x0, newobjective2, 1.4901161193847656e-08, *args)
array([2710.27860928, 2710.27860928, 2710.27860928, 2710.27860928,
2710.27860928, 2710.27860928, 2710.27861023, 2710.27861023,
2710.27860928, 2710.27860928, 2710.27860928, 2710.27860928,
2710.27860928, 2710.27860928, 2710.27861023, 2710.27861023,
2710.27860928, 2710.27860928, 2710.27860928, 2710.27860928,
2710.27860928, 2710.27860928, 2710.27861023, 2710.27861023,
2710.27861023, -0.04204273, -0.04204273, -0.04204369,
-0.04204369, -0.04204369, -0.04204369, -0.04204369,
-0.04204273, -0.04204273, -0.04204273, -0.04204369,
-0.04204369, -0.04204369, -0.04204369, -0.04204273,
-0.04204273, -0.04204273, -0.04204273, -0.04204369,
-0.04204369, -0.04204369, -0.04204369, -0.04204273,
-0.04204273, -0.04204273, -0.40110588, -0.40110588,
-0.40110779, -0.40110779, -0.40110683, -0.40110683,
-0.40110588, -0.40110588, -0.40110588, -0.40110588,
-0.40110779, -0.40110779, -0.40110683, -0.40110683,
-0.40110588, -0.40110588, -0.40110588, -0.40110588,
-0.40110779, -0.40110779, -0.40110683, -0.40110683,
-0.40109634, -0.40109634, -0.40109634])
This kind of badly scaled gradient is something that minimize doesn't tolerate very well, but I don't understand the function you're optimizing in enough detail to know how to improve it. It's also a little suspicious that some of the variables have exactly the same gradients, which suggests that you could re-parameterize the problem into fewer variables, for example along the lines of the sketch below.
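A sketch only (my assumption, based on the posted example data where the saturation and time-variable arrays are constant within each channel; check whether that also holds for your real data): optimize one spend level per channel and repeat it across the 25 periods, shrinking 75 variables to 3.
# sketch: one decision variable per channel, expanded to 25 identical periods
def per_channel_objective(y, *a):
    x_full = np.repeat(y, 25)          # 25 identical periods per channel
    return newobjective2(x_full, *a)

y0 = np.array([0.011, 0.011, 0.011])   # hypothetical strictly feasible start
cons_small = [{'type': 'ineq', 'fun': lambda y: totalbudget - 25 * np.sum(y)}]
bounds_small = [bounds[0], bounds[25], bounds[50]]   # one bound pair per channel
res_small = minimize(per_channel_objective, x0=y0, args=tuple(args),
                     constraints=cons_small, method='SLSQP', bounds=bounds_small)
print(res_small.x, res_small.fun)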