python scipy curve-fitting scipy-optimize approximation

Why my approximation in scipy curve_fit is so bad?

Before approximating the data of my calculations, I decided to test the code on data from Q. Zhang, N. Sabelli, V. Buch; Potential energy surface of H⋅⋅⋅H2O. J. Chem. Phys. 15 July 1991; 95 (2): 1080–1085, an old article in the public domain, and ran into a problem: my approximation by the same function as in the article turns out to be very far from both the data and the approximation from the article, it does not even have a minimum, which is very important for this range of tasks

When plotting the article approximation graph, I discarded the first point for each geometry (except for the last one, both the article approximation and mine are bad there), because the energy is very large (apparently in order to have a good correspondence in the minimum area) and the graph turns out to be useless.

Cloud with data.

from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math
import scipy
from numpy import array

data = pd.read_excel('data.xlsx', header=None)
values_E = array(data[3])
values_R = array(data[0])
values_theta = array([math.radians(data[1][i]) for i in range(len(data[1]))])
values_phi = array([math.radians(data[2][i]) for i in range(len(data[2]))])
values = array([values_R, values_theta, values_phi])

def mapping(values, eps00, eps10, eps20, eps2_2, sigma00, sigma10, sigma20, sigma2_2):
    return [math.fsum([
    4*eps00*(sigma00/values[0][i])**6*((sigma00/values[0][i])**6 - 1)* 1/2 * np.sqrt(1/np.pi),
    4*eps10*(sigma10/values[0][i])**6*((sigma10/values[0][i])**6 - 1)* 1/2 * np.sqrt(3/np.pi) * np.cos(values[1][i]),
    4*eps20*(sigma20/values[0][i])**6*((sigma20/values[0][i])**6 - 1)* 1/4 * np.sqrt(5/np.pi) * (3 * (np.cos(values[1][i]))**2 - 1),
    4*eps2_2*(sigma2_2/values[0][i])**6*((sigma2_2/values[0][i])**6 - 1)* 1/4 * np.sqrt(15/np.pi) * (np.sin(values[1][i]))**2 * (np.sin(values[2][i]))**2
    ]) for i in range(len(values_R))]
param_bounds=([0,-20, 0, 0, 0, 0, 0, 0], [150,10,10,100,10,10,10,10])
args, _ = curve_fit(mapping, values, values_E, bounds=param_bounds, maxfev=10000000)
eps00, eps10, eps20, eps2_2, sigma00, sigma10, sigma20, sigma2_2 = args

#energy by approximation from the article and by my
article = mapping(values, 107.9, -9.5, 8.6, 35.6, 3.0, 3.3, 2.98, 2.92) 
new_E = mapping(values, eps00, eps10, eps20, eps2_2, sigma00, sigma10, sigma20, sigma2_2)

sp = plt.subplot(231) #90_00
plt.plot(values_R[0:14], values_E[0:14], label='E_90_00')
plt.plot(values_R[1:14], article[1:14], label='article')
plt.plot(values_R[0:14], new_E[0:14], label='new_E')
plt.xlabel('R_90_00')
plt.ylabel('E_90_00')
plt.legend(loc='best')

sp = plt.subplot(232) #90_90
plt.plot(values_R[14:28], values_E[14:28], label='E_90_90')
plt.plot(values_R[15:28], article[15:28], label='article')
plt.plot(values_R[14:28], new_E[14:28], label='new_E')
plt.xlabel('R_90_90')
plt.ylabel('E_90_90')
plt.legend(loc='best')

sp = plt.subplot(233) #00_00
plt.plot(values_R[28:42], values_E[28:42], label='E_00_00')
plt.plot(values_R[29:42], article[29:42], label='article')
plt.plot(values_R[28:42], new_E[28:42], label='new_E')
plt.xlabel('R_00_00')
plt.ylabel('E_00_00')
plt.legend(loc='best')

sp = plt.subplot(234) #180_00
plt.plot(values_R[42:56], values_E[42:56], label='E_180_00')
plt.plot(values_R[43:56], article[43:56], label='article')
plt.plot(values_R[42:56], new_E[42:56], label='new_E')
plt.xlabel('R_180_00')
plt.ylabel('E_180_00')
plt.legend(loc='best')

sp = plt.subplot(235) #127_90
plt.plot(values_R[56:62], values_E[56:62], label='E_127_90')
plt.plot(values_R[56:62], article[56:62], label='article')
plt.plot(values_R[56:62], new_E[56:62], label='new_E')
plt.xlabel('R_127_90')
plt.ylabel('E_127_90')
plt.legend(loc='best')

plt.show()

I checked the input and rewrote the mapping to a better look and made very precise restrictions on the parameters, which will be impossible when you don’t know what should turn out, but it’s all in vain. I did not describe the original problem and the article because at the moment everything has come down to a problem with the code.

The curve of my energies should coincide with the curve of the data from the article or at least on those approximated in it.

Solution

Plenty of errors of varying severity:

when you read into a Pandas dataframe from this spreadsheet, you need to provide column names; don't use positional column indices
don't use math anywhere
mapping is not an appropriate name for your function when considering the terminology of the article
fsum is the wrong place to care about accuracy and should go away
general failure to vectorise
failure to properly use dataframe grouping in your plots

But all of that still doesn't account for most of the error in your fit. My educated guess is as follows:

Note that in the article in Table III, all angle groups are evaluated for radii from 2 through 6 angstroms except the last group of 127.74° / 90°. I believe it to be no coincidence that their fit for that group is much better than yours and their fit for all other groups is worse than yours. I think what happened is that their fit was applied to a larger angle group than what they printed in the article, or, they applied different fitting error weights than you did such that they normalize by the amplitude of each group.

This theory is corroborated by the fact that if you only fit to this group, you see an effect similar to theirs: the fit for that group gets better and the fit for every other group has a starting peak that is too high.

from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

data = pd.read_csv('data.csv', header=None, names=('R', 'theta', 'phi', 'E'), decimal=',')
data[['theta_rad', 'phi_rad']] = np.deg2rad(data[['theta', 'phi']])
values = data[['R', 'theta_rad', 'phi_rad']].values.T


def Vfit(
    values: np.ndarray,
      eps0p0: float,   eps1p0: float,   eps2p0: float,   eps2n2: float,
    sigma0p0: float, sigma1p0: float, sigma2p0: float, sigma2n2: float,
) -> np.ndarray:
    R, theta, phi = values

    cos_theta = np.cos(theta)
    Y1p0 = 0.50*np.sqrt( 3/np.pi) * cos_theta
    Y2p0 = 0.25*np.sqrt( 5/np.pi) * (3*cos_theta**2 - 1)
    Y2n2 = 0.25*np.sqrt(15/np.pi) * np.sin(theta)**2 * np.sin(phi)**2
    Y0p0 = np.full_like(Y1p0, fill_value=0.50*np.sqrt(1/np.pi))  # 1*pi?

    Y = np.array((Y0p0, Y1p0, Y2p0, Y2n2))
    eps = (eps0p0,), (eps1p0,), (eps2p0,), (eps2n2,),
    sigma = (sigma0p0,), (sigma1p0,), (sigma2p0,), (sigma2n2,),

    ratio = sigma/R
    product = 4*Y*eps*(ratio**12 - ratio**6)
    return product.sum(axis=0)


is_fit = (data.phi.astype(int) == 90) & (data.theta.astype(int) == 127)

args, _ = curve_fit(
    f=Vfit, xdata=values[:, is_fit], ydata=data.E[is_fit],
    bounds=(
        # epsilon        sigma
        (  0,-20, 0,  0,  0, 0, 0, 0),
        (150, 10,10,100, 10,10,10,10),
    ),
    p0=(110, -10, 10, 30, 3, 3, 3, 3),
    maxfev=10_000,
)
eps = args[:4]
sigma = args[4:]
print('Fit epsilon:', eps)
print('Fit sigma:', sigma)

params_fbd = 107.9, -9.5, 8.6, 35.6, 3.0, 3.3, 2.98, 2.92
article = Vfit(values, *params_fbd)
fit_E = Vfit(values, *eps, *sigma)

fig, ax_rows = plt.subplots(nrows=2, ncols=3)
all_axes = [ax for row in ax_rows for ax in row]
for ax, ((theta, phi), group) in zip(all_axes, data.groupby(['theta', 'phi'])):
    ax.plot(group.R, group.E, label='E')
    ax.plot(group.R, article[group.index], label='article')
    ax.plot(group.R, fit_E[group.index], label='fit_E')
    ax.set_xlabel('R (Å)')
    ax.set_ylabel('E (cm⁻¹)')
    ax.set_title(f'theta={theta} phi={phi}')
    ax.legend()

plt.show()

Fit epsilon: [113.92910577 -11.10362669   9.99792662  38.44309601]
Fit sigma: [3.09518901 3.09459379 3.09304591 3.09410421]