python, curve-fitting, uncertainty

How to get errors on a cubic spline interpolation (Python; splrep, splev)


I am performing a weighted cubic spline fit to some data x, y, and errors on y. I then want to interpolate other values and their errors along that curve. I am having a very difficult time figuring out how to do the error propagation on each interpolated point and could use some help.

My current approach:

import sys
import numpy as np
from PySide6.QtWidgets import QApplication, QMainWindow, QVBoxLayout, QWidget
from matplotlib.backends.backend_qtagg import FigureCanvasQTAgg as FigureCanvas
from matplotlib.figure import Figure
from scipy.interpolate import splrep, splev

class MplCanvas(FigureCanvas):
    def __init__(self, parent=None, width=5, height=4, dpi=100):
        fig = Figure(figsize=(width, height), dpi=dpi)
        self.axes = fig.add_subplot(111)
        super().__init__(fig)

class MainWindow(QMainWindow):
    def __init__(self):
        super().__init__()

        self.canvas = MplCanvas(self, width=5, height=4, dpi=100)
        layout = QVBoxLayout()
        layout.addWidget(self.canvas)
        container = QWidget()
        container.setLayout(layout)
        self.setCentralWidget(container)

        self.plot()

    def plot(self):
        # Generate random data with errors
        np.random.seed(0)
        x = np.linspace(0, 10, 10)
        y = np.sin(x) + np.random.normal(0, 0.1, len(x))
        y_err = np.random.normal(0.1, 0.02, len(x)) + 0.5

        # Fit a weighted cubic spline using splrep
        tck = splrep(x, y, w=1/y_err, k=3)

        # Interpolate values using splev
        x_interp = np.linspace(0, 10, 100)
        y_interp = splev(x_interp, tck)

        # Calculate the fitted values at the data points (for residuals)
        y_fit = splev(x, tck)

        # Calculate residuals
        residuals = y - y_fit

        # Calculate chi-squared and degrees of freedom
        chi2 = np.sum((residuals / y_err) ** 2)
        dof = len(x) - len(tck[1]) // 3 - 1

        # Covariance matrix for weighted least squares
        cov_matrix = np.diag(y_err ** 2) * chi2 / dof

        # Derivative dy/dx at the interpolated points (splev accepts arrays)
        jacobian = splev(x_interp, tck, der=1)

        # Extract the diagonal elements of the covariance matrix (variances)
        variances = np.diag(cov_matrix)

        # Convert variances to column vector for broadcasting
        variances_column = variances[:, np.newaxis]

        # Propagate errors through the squared Jacobian and sum up
        jacobian_squared = jacobian ** 2
        weighted_jacobian = jacobian_squared * variances_column
        sum_weighted_jacobian = np.sum(weighted_jacobian, axis=0)

        # Calculate the final interpolated errors
        y_interp_err = np.sqrt(sum_weighted_jacobian)

        # Plot the results
        self.canvas.axes.errorbar(x, y, yerr=y_err, fmt='o', label='Data')
        self.canvas.axes.plot(x_interp, y_interp, label='Cubic Spline Fit')
        self.canvas.axes.fill_between(x_interp, y_interp - y_interp_err, y_interp + y_interp_err, alpha=0.2, label='Error')
        self.canvas.axes.legend()
        self.canvas.draw()

app = QApplication(sys.argv)
window = MainWindow()
window.show()
sys.exit(app.exec())

This generates points where the errors are zero, e.g.:

[plot: the interpolated curve with an error band that pinches to zero at several points]

which looks incorrect. What am I doing wrong?


Solution

    Why?

    It seems you are trying to propagate the y errors through the gradient with respect to the x variable, Jx = dy/dx, in order to estimate errors on y. This procedure is circular, and it is dimensionally incorrect in any case.

    The reason your "error" function goes to zero is simply that the gradient vanishes wherever the model has zero slope (you can check this visually: the zeros occur exactly where the fitted curve is flat).
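    In fact, the quantity you computed factors algebraically: your y_interp_err equals |dy/dx| * sqrt(variances.sum()), so it inherits every zero of the slope. A quick check of this, assuming tck, x_interp, variances and y_interp_err from your plot() method are still in scope:

    slope = splev(x_interp, tck, der=1)
    # the band is proportional to |slope|, hence the spurious zeros
    np.testing.assert_allclose(y_interp_err,
                               np.abs(slope) * np.sqrt(variances.sum()))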

    How?

    What you are probably looking for is parameter error propagation (see this reference): in this scenario we propagate the parameter uncertainty through the gradient Jp using the classical formula Cy = Jp @ Cp @ Jp.T. That is, it provides the uncertainty on y induced by the uncertainty on the fitted parameters p of the model.
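    To make the projection concrete, here is a toy computation for a single evaluation point of a two-parameter model (the numbers are invented purely for illustration):

    import numpy as np

    Jp = np.array([0.8, 1.0])        # d(model)/da, d(model)/db at one point x
    Cp = np.array([[0.04, 0.01],
                   [0.01, 0.09]])    # parameter covariance matrix
    var_y = Jp @ Cp @ Jp.T           # scalar variance of y at that point
    # var_y = 0.8**2 * 0.04 + 2 * 0.8 * 1.0 * 0.01 + 1.0**2 * 0.09 = 0.1316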

    Unfortunately this is not straightforward with BSpline objects, but we can perform it with curve_fit, which returns the parameter covariance estimate alongside the fitted parameters.

    I leave an MCVE below to show how to perform such parametric propagation.

    If you are looking for a non-parametric procedure, you should probably have a look at Gaussian Process Regression, which is nicely implemented in sklearn (a short sketch is given at the end of this answer).

    MCVE

    import numpy as np
    import numdifftools as nd
    from scipy import interpolate, optimize, stats
    import matplotlib.pyplot as plt
    

    We create the model with some ad hoc parameters:

    def model(x, a, b, c):
        return a * np.sin(b * x) + c
    

    We generate some synthetic dataset with known uncertainties:

    np.random.seed(0)
    m = 10
    x = np.linspace(0, 10, m)
    s = 0.5 * np.ones(x.size)               # known noise standard deviation
    n = s * np.random.normal(size=x.size)   # one noise realisation
    p = (1., 1., 0.)                        # true parameters
    y = model(x, *p)
    yn = y + n                              # noisy observations
    

    We assess the exact model:

    xlin = np.linspace(x.min(), x.max(), 200)
    ylin = model(xlin, *p)
    

    We can smooth the dataset using BSpline (for comparison's sake):

    tck = interpolate.splrep(x, yn, w=1/s, k=3)   # weighted cubic smoothing spline
    spline = interpolate.BSpline(*tck)            # wrap (t, c, k) as a BSpline object
    yspl = spline(xlin)
    

    We fit the model with curve_fit:

    # absolute_sigma=True: sigma contains absolute standard deviations,
    # so pcov is not rescaled by the reduced chi-squared
    popt, pcov = optimize.curve_fit(model, x, yn, sigma=s, absolute_sigma=True)
    yhat = model(xlin, *popt)
    

    Then, using the parameter covariance matrix, we can assess the error on y:

    def variance(model, x, p, Cp):

        # model as a function of the parameters only (x held fixed)
        def proxy(q):
            return model(x, *q)

        # project the parameter covariance onto y for one Jacobian row
        def projection(J):
            return J @ Cp @ J.T

        Jp = nd.Gradient(proxy)(p)                   # Jacobian, shape (len(x), len(p))
        Cy = np.apply_along_axis(projection, 1, Jp)  # variance of y at each x

        return Cy
    

    Finally, we assess the confidence bands:

    alpha = 0.01                            # two-sided significance level
    z = stats.norm.ppf(1 - alpha / 2.)      # ~2.576 for alpha = 0.01
    Cy = variance(model, xlin, popt, pcov)
    sy = np.sqrt(Cy)                        # standard error of the fit
    ci = z * sy                             # half-width of the confidence band
    

    Plotting everything to check:

    fig, axe = plt.subplots()
    axe.errorbar(x, yn, yerr=s, linestyle="none", marker=".", label="Data")
    axe.plot(xlin, ylin, label="Model")
    axe.plot(xlin, yspl, label="BSpline")
    axe.plot(xlin, yhat, label="NLLS")
    axe.fill_between(xlin, yhat - ci, yhat + ci, alpha=0.2, label="CI 1 %")
    axe.set_title("Error Propagation")
    axe.set_xlabel("Variable $x$")
    axe.set_ylabel("Variable $y$")
    axe.legend()
    axe.grid()
    

    [plot: data with error bars, exact model, BSpline smoothing, and the NLLS fit with its confidence band]

    The confidence band belongs to the red curve (the NLLS fit).
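    For completeness, here is a minimal sketch of the non-parametric alternative mentioned above, using sklearn's Gaussian Process Regression. It reuses x, yn, s, xlin and z from the MCVE; the kernel choice and its hyperparameters are illustrative assumptions, not a recommendation:

    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import RBF, ConstantKernel

    # alpha takes the noise *variance* at each training point
    kernel = ConstantKernel(1.0) * RBF(length_scale=1.0)
    gpr = GaussianProcessRegressor(kernel=kernel, alpha=s**2, normalize_y=True)
    gpr.fit(x.reshape(-1, 1), yn)

    # predictive mean and pointwise standard deviation on the fine grid
    mu, sigma = gpr.predict(xlin.reshape(-1, 1), return_std=True)
    ci_gp = z * sigma                # confidence band, same z as above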