Tags: python, matplotlib, statsmodels, prediction, confidence-interval

How to Calculate and Plot Prediction and Confidence Intervals for Linear Regression


I need to plot prediction and confidence intervals, and I have to use Python with only the following packages. How do I plot both intervals for the same model? I managed to plot the prediction intervals with the help of ChatGPT; this is the code, including the data set.

import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# DataFrame of the 22 observed (X, Y) pairs
data = {
    'X': [55641, 55681, 55637, 55825, 55772, 55890, 56068, 56299, 56825, 57205, 
          57562, 57850, 57975, 57992, 58240, 58414, 58561, 59066, 58596, 58631, 
          58758, 59037],
    'Y': [21886, 21934, 21699, 21901, 21812, 21714, 21932, 22086, 22265, 22551, 
          22736, 22301, 22518, 22580, 22618, 22890, 23112, 23315, 22865, 22788, 
          22949, 23149]
}

df = pd.DataFrame(data)

# Fit an ordinary least squares regression of Y on X
model = smf.ols(formula='Y ~ X', data=df)
results = model.fit()

print(results.summary())

# Calculate 95% prediction intervals (alpha=0.05); summary_frame also
# contains the mean_ci_* columns needed for the confidence interval
predictions = results.get_prediction(df)
prediction_summary_frame = predictions.summary_frame(alpha=0.05)

# Scatter the raw data points
plt.scatter(df['X'], df['Y'], color='black', label='Data')

# Fitted regression line
plt.plot(df['X'], results.fittedvalues, color='#58C9F4', label='Regression Line')

# Shade the 95% prediction interval band (interval for a new observation)
plt.fill_between(df['X'], prediction_summary_frame['obs_ci_lower'], prediction_summary_frame['obs_ci_upper'], color='grey', alpha=0.2, label='95% Prediction Interval')

plt.xlabel('X')
plt.ylabel('Y')
plt.title('Linear Regression with Prediction Intervals')
plt.legend()
plt.show()

How should I continue to be able to plot both intervals?


Solution

  • Explanation:

    import statsmodels.api as sm
    import statsmodels.formula.api as smf
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    
    # DataFrame of the 22 observed (X, Y) pairs
    data = {
        'X': [55641, 55681, 55637, 55825, 55772, 55890, 56068, 56299, 56825, 57205, 
              57562, 57850, 57975, 57992, 58240, 58414, 58561, 59066, 58596, 58631, 
              58758, 59037],
        'Y': [21886, 21934, 21699, 21901, 21812, 21714, 21932, 22086, 22265, 22551, 
              22736, 22301, 22518, 22580, 22618, 22890, 23112, 23315, 22865, 22788, 
              22949, 23149]
    }
    
    df = pd.DataFrame(data)
    
    # Sort by X so plot() and fill_between() draw clean left-to-right bands;
    # the raw X values are not monotonic (e.g. 59066 is followed by 58596),
    # which would make the shaded regions fold back on themselves.
    df = df.sort_values('X').reset_index(drop=True)
    
    # Fit an ordinary least squares regression of Y on X
    model = smf.ols(formula='Y ~ X', data=df)
    results = model.fit()
    
    print(results.summary())
    
    # get_prediction() provides, for every X, both the interval for a new
    # observation (obs_ci_lower/upper -> prediction interval) and for the
    # mean response (mean_ci_lower/upper -> confidence interval).
    # Note: results.conf_int() is NOT needed here — it returns confidence
    # intervals for the fitted coefficients, not for the mean response.
    predictions = results.get_prediction(df)
    prediction_summary_frame = predictions.summary_frame(alpha=0.05)
    
    # Data points
    ax = df.plot(kind='scatter', x='X', y='Y', color='black', label='Data')
    
    # Regression line
    ax.plot(df['X'], results.fittedvalues, color='#58C9F4', label='Regression Line')
    
    # 95% prediction interval (where a new single observation is expected)
    ax.fill_between(df['X'], prediction_summary_frame['obs_ci_lower'], prediction_summary_frame['obs_ci_upper'], color='grey', alpha=0.2, label='95% Prediction Interval')
    
    # 95% confidence interval (uncertainty of the mean response)
    ax.fill_between(df['X'], prediction_summary_frame['mean_ci_lower'], prediction_summary_frame['mean_ci_upper'], color='blue', alpha=0.2, label='95% Confidence Interval')
    
    ax.set(xlabel='X', ylabel='Y', title='Linear Regression with Prediction and Confidence Intervals')
    ax.legend()
    plt.show()
    
                                OLS Regression Results                            
    ==============================================================================
    Dep. Variable:                      Y   R-squared:                       0.919
    Model:                            OLS   Adj. R-squared:                  0.915
    Method:                 Least Squares   F-statistic:                     227.5
    Date:                Tue, 11 Jun 2024   Prob (F-statistic):           2.17e-12
    Time:                        06:30:58   Log-Likelihood:                -140.06
    No. Observations:                  22   AIC:                             284.1
    Df Residuals:                      20   BIC:                             286.3
    Df Model:                           1                                         
    Covariance Type:            nonrobust                                         
    ==============================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
    ------------------------------------------------------------------------------
    Intercept    559.4600   1450.698      0.386      0.704   -2466.642    3585.562
    X              0.3815      0.025     15.084      0.000       0.329       0.434
    ==============================================================================
    Omnibus:                        0.314   Durbin-Watson:                   1.479
    Prob(Omnibus):                  0.855   Jarque-Bera (JB):                0.390
    Skew:                          -0.242   Prob(JB):                        0.823
    Kurtosis:                       2.562   Cond. No.                     2.64e+06
    ==============================================================================
    
    Notes:
    [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
    [2] The condition number is large, 2.64e+06. This might indicate that there are
    strong multicollinearity or other numerical problems.
    

    enter image description here


    In the original code, the intercept at x = 0 is correct, but to visually represent it, the x-axis limits need to be expanded. Additionally, the slope appears distorted due to an unequal aspect ratio between the X and Y axes, causing a misrepresentation of the linear relationship.

    # Sort by X so fill_between() draws clean bands; the raw X values are
    # not monotonic, which would make the shaded regions fold back.
    df = df.sort_values('X').reset_index(drop=True)
    
    # Fit an ordinary least squares regression of Y on X
    model = smf.ols(formula='Y ~ X', data=df)
    results = model.fit()
    
    # Extract the fitted coefficients by name
    intercept = results.params['Intercept']
    slope = results.params['X']
    
    # Regression line extended down to x = 0 so the y-intercept is visible.
    # Upper end derived from the data instead of the hard-coded 59037.
    x_values = np.linspace(0, df['X'].max(), 100)
    y_values = intercept + slope * x_values
    
    # Prediction and confidence intervals both come from summary_frame();
    # results.conf_int() (parameter CIs) is not needed for the plot.
    predictions = results.get_prediction(df)
    prediction_summary_frame = predictions.summary_frame(alpha=0.05)
    
    # Data points
    ax = df.plot(kind='scatter', x='X', y='Y', color='black', label='Data', s=1, figsize=(12, 10))
    
    # Regression line
    ax.plot(x_values, y_values, color='#58C9F4', label='Regression Line')
    
    # Adding the point where the regression line intersects the y-axis
    ax.scatter(0, intercept, color='red', label='Y-intercept (x=0)', s=2)
    ax.annotate(f'(0, {intercept:.2f})', xy=(0, intercept), xytext=(2000, intercept + 1000), arrowprops=dict(facecolor='black', shrink=0.05))
    
    # Annotating the regression line with the slope
    ax.text(x_values[len(x_values)//2], y_values[len(y_values)//2], f'Slope: {slope:.2f}', fontsize=12, verticalalignment='bottom')
    
    # 95% prediction interval (new observations)
    ax.fill_between(df['X'], prediction_summary_frame['obs_ci_lower'], prediction_summary_frame['obs_ci_upper'], color='grey', alpha=0.2, label='95% Prediction Interval')
    
    # 95% confidence interval (mean response)
    ax.fill_between(df['X'], prediction_summary_frame['mean_ci_lower'], prediction_summary_frame['mean_ci_upper'], color='blue', alpha=0.2, label='95% Confidence Interval')
    
    # Equal aspect ratio so the slope is not visually distorted
    ax.set_aspect('equal', 'box')
    
    ax.set(xlabel='X', ylabel='Y', title='Linear Regression with Prediction and Confidence Intervals')
    ax.legend(bbox_to_anchor=(1, 0.5), loc='center left', frameon=False)
    plt.show()
    

    enter image description here