I am looking to fit a log-normal curve to data that roughly follows a log-normal distribution.
The data come from a laser diffraction machine that measures particle size distributions of sprays. The ultimate goal of this code is to recreate this method, which uses OriginPro software designed for XRD curve fitting (a similar problem), for my own data, and to integrate it into the Python analysis I am writing for my research.
I adapted the code from this post to (ideally) handle log-normal distributions. I have simplified my code to handle only the first log-normal peak in the data, so now it is only trying to fit ONE log-normal distribution. The data I have provided are also simplified to only have one peak to fit. Sample data and code are given at the bottom of this post.
I have some previous experience with model fitting in LMFIT, but that was with a user-defined state-space model for temporal modelling and the LMFIT minimize() function, so I am unsure where to even start debugging the curve-fitting component of this code.
Can anyone help me figure out why I am unable to get a fit to this data? Note that the result I am getting is a trivial one (straight line at y=0).
Working on Windows 7 (laptop) and 10 (desktop)
Running python -V in a CMD window gives:
Python 3.5.3 :: Anaconda 4.1.1 (64-bit)
Here's the data for a sample distribution:
import random

import numpy as np
import matplotlib.pyplot as plt
from scipy import signal
from lmfit import models

sizes = np.array([ 1.26500000e-01, 1.47000000e-01, 1.71500000e-01,
2.00000000e-01, 2.33000000e-01, 2.72000000e-01,
3.17000000e-01, 3.69500000e-01, 4.31000000e-01,
5.02500000e-01, 5.86000000e-01, 6.83500000e-01,
7.97000000e-01, 9.29000000e-01, 1.08300000e+00,
1.26250000e+00, 1.47200000e+00, 1.71650000e+00,
2.00100000e+00, 2.33300000e+00, 2.72050000e+00,
3.17200000e+00, 3.69800000e+00, 4.31150000e+00,
5.02700000e+00, 5.86100000e+00, 6.83300000e+00,
7.96650000e+00, 9.28850000e+00, 1.08295000e+01,
1.26265000e+01, 1.47215000e+01, 1.71640000e+01,
2.00115000e+01, 2.33315000e+01, 2.72030000e+01,
3.17165000e+01, 3.69785000e+01, 4.31135000e+01,
5.02665000e+01, 5.86065000e+01, 6.83300000e+01,
7.96670000e+01, 9.28850000e+01, 1.08296000e+02,
1.26264000e+02, 1.47213000e+02, 1.71637500e+02,
2.00114500e+02, 2.33316500e+02])
y_exp = np.array([ 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.01,
0.02, 0.03, 0.04, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11,
0.13, 0.19, 0.3 , 0.48, 0.74, 1.1 , 1.56, 2.11, 2.72,
3.37, 3.99, 4.55, 4.99, 5.3 , 5.48, 5.53, 5.48, 5.36,
5.19, 4.97, 4.67, 4.28, 3.79, 3.18, 2.48, 1.73, 1. ,
0.35, 0. , 0. , 0. , 0. ])
Here are the functions:
def generate_model(spec):
    composite_model = None
    params = None
    x = spec['x']
    y = spec['y']
    x_min = np.min(x)
    x_max = np.max(x)
    x_range = x_max - x_min
    y_max = np.max(y)
    for i, basis_func in enumerate(spec['model']):
        # prefix = f'm{i}_'
        prefix = 'm{0}_'.format(i)
        model = getattr(models, basis_func['type'])(prefix=prefix)
        if basis_func['type'] in ['LognormalModel', 'GaussianModel', 'LorentzianModel', 'VoigtModel']:  # for now VoigtModel has gamma constrained to sigma
            model.set_param_hint('sigma', min=1e-6, max=x_range)
            model.set_param_hint('center', min=x_min, max=x_max)
            model.set_param_hint('height', min=1e-6, max=1.1*y_max)
            model.set_param_hint('amplitude', min=1e-6)
            # default guess is horrible!! do not use guess()
            default_params = {
                prefix+'center': x_min + x_range * random.random(),
                prefix+'height': y_max * random.random(),
                prefix+'sigma': x_range * random.random()
            }
        else:
            # raise NotImplementedError(f'model {basis_func["type"]} not implemented yet')
            raise NotImplementedError('model {0} not implemented yet'.format(basis_func["type"]))
        if 'help' in basis_func:  # allow override of settings in parameter
            for param, options in basis_func['help'].items():
                model.set_param_hint(param, **options)
        model_params = model.make_params(**default_params, **basis_func.get('params', {}))
        if params is None:
            params = model_params
        else:
            params.update(model_params)
        if composite_model is None:
            composite_model = model
        else:
            composite_model = composite_model + model
    return composite_model, params
def update_spec_from_peaks(spec, model_indicies, peak_widths=np.arange(1, 10), **kwargs):
    x = spec['x']
    y = spec['y']
    x_range = np.max(x) - np.min(x)
    peak_indicies = signal.find_peaks_cwt(y, peak_widths)
    np.random.shuffle(peak_indicies)
    # for peak_indicie, model_indicie in zip(peak_indicies.tolist(), model_indicies):
    for peak_indicie, model_indicie in zip(peak_indicies, model_indicies):
        model = spec['model'][model_indicie]
        if model['type'] in ['LognormalModel', 'GaussianModel', 'LorentzianModel', 'VoigtModel']:
            params = {
                'height': y[peak_indicie],
                'sigma': x_range / len(x) * np.min(peak_widths),
                'center': x[peak_indicie]
            }
            if 'params' in model:
                model.update(params)
            else:
                model['params'] = params
        else:
            # raise NotImplementedError(f'model {model["type"]} not implemented yet')
            raise NotImplementedError('model {0} not implemented yet'.format(model["type"]))
    return peak_indicies
Here is the mainline:
spec = {
    'x': sizes,
    'y': y_exp,
    'model': [
        {
            'type': 'LognormalModel',
            'params': {'center': 20, 'height': 3, 'sigma': 1},
            # 'help': {'center': {'min': 10, 'max': 30}}
        }
    ]
}

num_comp = list(range(0, len(spec['model'])))
peaks_found = update_spec_from_peaks(spec, num_comp, peak_widths=np.arange(1, 10))

# For checking peak fitting
print(peaks_found)

fig, ax = plt.subplots()
ax.scatter(spec['x'], spec['y'], s=4)
for i in peaks_found:
    ax.axvline(x=spec['x'][i], c='black', linestyle='dotted')

model, params = generate_model(spec)
output = model.fit(spec['y'], params, x=spec['x'])
fig, gridspec = output.plot()
Your script has quite a bit of extra stuff. Stripping it down to the essentials gives:
import numpy as np
from lmfit import models
import matplotlib.pyplot as plt
x = np.array([ 1.26500000e-01, 1.47000000e-01, 1.71500000e-01,
2.00000000e-01, 2.33000000e-01, 2.72000000e-01,
3.17000000e-01, 3.69500000e-01, 4.31000000e-01,
5.02500000e-01, 5.86000000e-01, 6.83500000e-01,
7.97000000e-01, 9.29000000e-01, 1.08300000e+00,
1.26250000e+00, 1.47200000e+00, 1.71650000e+00,
2.00100000e+00, 2.33300000e+00, 2.72050000e+00,
3.17200000e+00, 3.69800000e+00, 4.31150000e+00,
5.02700000e+00, 5.86100000e+00, 6.83300000e+00,
7.96650000e+00, 9.28850000e+00, 1.08295000e+01,
1.26265000e+01, 1.47215000e+01, 1.71640000e+01,
2.00115000e+01, 2.33315000e+01, 2.72030000e+01,
3.17165000e+01, 3.69785000e+01, 4.31135000e+01,
5.02665000e+01, 5.86065000e+01, 6.83300000e+01,
7.96670000e+01, 9.28850000e+01, 1.08296000e+02,
1.26264000e+02, 1.47213000e+02, 1.71637500e+02,
2.00114500e+02, 2.33316500e+02])
y = np.array([ 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.01, 0.02,
0.03, 0.04, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11, 0.13, 0.19,
0.3 , 0.48, 0.74, 1.1 , 1.56, 2.11, 2.72, 3.37, 3.99, 4.55,
4.99, 5.3 , 5.48, 5.53, 5.48, 5.36, 5.19, 4.97, 4.67, 4.28,
3.79, 3.18, 2.48, 1.73, 1. , 0.35, 0. , 0. , 0. , 0. ])
model = models.LognormalModel()
params = model.make_params(center=20, sigma=3, amplitude=5)
result = model.fit(y, params, x=x)
print(result.fit_report())
plt.plot(x, y, label='data')
plt.plot(x, result.best_fit, label='fit')
plt.legend()
plt.show()
This runs and gives a decent if not quite perfect fit.
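If you later go back to fitting several peaks, the same minimal pattern extends to a sum of prefixed LognormalModels without the spec machinery. The following is only a sketch: it reuses the x and y arrays from the snippet above just to be runnable, and the second-peak starting values are purely illustrative, since the data posted here has only one peak.

from lmfit import models

# Two log-normal components, distinguished by their parameter prefixes.
peak1 = models.LognormalModel(prefix='p1_')
peak2 = models.LognormalModel(prefix='p2_')
model = peak1 + peak2

# Illustrative starting values only -- tune these for real two-peak data.
params = peak1.make_params(center=3, sigma=1, amplitude=5)
params.update(peak2.make_params(center=5, sigma=1, amplitude=2))

# Keep widths and areas physically sensible (non-negative).
for name in ('p1_sigma', 'p2_sigma', 'p1_amplitude', 'p2_amplitude'):
    params[name].set(min=0)

result = model.fit(y, params, x=x)  # x, y: the arrays defined above
print(result.fit_report())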
In general, I would discourage you from setting "parameter hints" based on data ranges. Use the ability to set such limits sparingly, and only where they are inherent to the model (for example, sigma < 0 makes no sense).
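As a concrete illustration of that advice, here is a small sketch built on the single-model fit above: the only bounds kept are the ones the model itself demands (non-negative sigma and amplitude), and nothing is tied to the data range.

from lmfit import models

model = models.LognormalModel()
params = model.make_params(center=20, sigma=3, amplitude=5)

# Bounds that follow from the model itself: a negative width or area is meaningless.
params['sigma'].set(min=0)
params['amplitude'].set(min=0)

# No bounds tied to min(x), max(x) or max(y): 'center' stays unconstrained.
result = model.fit(y, params, x=x)  # x, y as defined in the snippet above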
I have no idea why your code uses random numbers to set initial values, but it certainly looks likely to produce initial values that are extremely poor choices.
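If you want starting values that come from the data rather than from random.random(), something deterministic along these lines is usually safer. This is only a heuristic sketch (the crude area and width guesses are mine, not part of lmfit), again reusing the x and y arrays from the minimal example above:

import numpy as np
from lmfit import models

# Heuristic, deterministic starting values for a single-peak fit:
# centre near the tallest point, amplitude from the area under the curve,
# and a generic width to start from.
i_max = int(np.argmax(y))
guess = {'center': x[i_max],
         'amplitude': float(np.trapz(y, x)),
         'sigma': 1.0}

model = models.LognormalModel()
params = model.make_params(**guess)
result = model.fit(y, params, x=x)
print(result.fit_report())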