Tags: python, matplotlib, statsmodels, prediction, confidence-interval

How to Calculate and Plot Prediction and Confidence Intervals for Linear Regression


I need to plot prediction and confidence intervals, and I have to use Python with only the following packages. How do I plot both intervals for the same model? I managed to plot the prediction intervals with the help of ChatGPT; this is the code, including the data set.

import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# DataFrame of the 22 observed (X, Y) pairs
data = {
    'X': [55641, 55681, 55637, 55825, 55772, 55890, 56068, 56299, 56825, 57205, 
          57562, 57850, 57975, 57992, 58240, 58414, 58561, 59066, 58596, 58631, 
          58758, 59037],
    'Y': [21886, 21934, 21699, 21901, 21812, 21714, 21932, 22086, 22265, 22551, 
          22736, 22301, 22518, 22580, 22618, 22890, 23112, 23315, 22865, 22788, 
          22949, 23149]
}

df = pd.DataFrame(data)

# Fit an ordinary least squares regression of Y on X
model = smf.ols(formula='Y ~ X', data=df)
results = model.fit()

print(results.summary())

# Calculate 95% prediction intervals (alpha=0.05); summary_frame also
# contains the mean_ci_* columns needed for the confidence interval
predictions = results.get_prediction(df)
prediction_summary_frame = predictions.summary_frame(alpha=0.05)

# Scatter the raw data points
plt.scatter(df['X'], df['Y'], color='black', label='Data')

# Fitted regression line
plt.plot(df['X'], results.fittedvalues, color='#58C9F4', label='Regression Line')

# Shade the 95% prediction interval band (interval for a new observation)
plt.fill_between(df['X'], prediction_summary_frame['obs_ci_lower'], prediction_summary_frame['obs_ci_upper'], color='grey', alpha=0.2, label='95% Prediction Interval')

plt.xlabel('X')
plt.ylabel('Y')
plt.title('Linear Regression with Prediction Intervals')
plt.legend()
plt.show()

How should I continue to be able to plot both intervals?


Solution

  • Explanation:

    import statsmodels.api as sm
    import statsmodels.formula.api as smf
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    
    # DataFrame of the 22 observed (X, Y) pairs
    data = {
        'X': [55641, 55681, 55637, 55825, 55772, 55890, 56068, 56299, 56825, 57205, 
              57562, 57850, 57975, 57992, 58240, 58414, 58561, 59066, 58596, 58631, 
              58758, 59037],
        'Y': [21886, 21934, 21699, 21901, 21812, 21714, 21932, 22086, 22265, 22551, 
              22736, 22301, 22518, 22580, 22618, 22890, 23112, 23315, 22865, 22788, 
              22949, 23149]
    }
    
    df = pd.DataFrame(data)
    
    # Sort by X so plot() and fill_between() draw clean left-to-right bands;
    # the raw X values are not monotonic (e.g. 59066 is followed by 58596),
    # which would make the shaded regions fold back on themselves.
    df = df.sort_values('X').reset_index(drop=True)
    
    # Fit an ordinary least squares regression of Y on X
    model = smf.ols(formula='Y ~ X', data=df)
    results = model.fit()
    
    print(results.summary())
    
    # get_prediction() provides, for every X, both the interval for a new
    # observation (obs_ci_lower/upper -> prediction interval) and for the
    # mean response (mean_ci_lower/upper -> confidence interval).
    # Note: results.conf_int() is NOT needed here — it returns confidence
    # intervals for the fitted coefficients, not for the mean response.
    predictions = results.get_prediction(df)
    prediction_summary_frame = predictions.summary_frame(alpha=0.05)
    
    # Data points
    ax = df.plot(kind='scatter', x='X', y='Y', color='black', label='Data')
    
    # Regression line
    ax.plot(df['X'], results.fittedvalues, color='#58C9F4', label='Regression Line')
    
    # 95% prediction interval (where a new single observation is expected)
    ax.fill_between(df['X'], prediction_summary_frame['obs_ci_lower'], prediction_summary_frame['obs_ci_upper'], color='grey', alpha=0.2, label='95% Prediction Interval')
    
    # 95% confidence interval (uncertainty of the mean response)
    ax.fill_between(df['X'], prediction_summary_frame['mean_ci_lower'], prediction_summary_frame['mean_ci_upper'], color='blue', alpha=0.2, label='95% Confidence Interval')
    
    ax.set(xlabel='X', ylabel='Y', title='Linear Regression with Prediction and Confidence Intervals')
    ax.legend()
    plt.show()
    
                                OLS Regression Results                            
    ==============================================================================
    Dep. Variable:                      Y   R-squared:                       0.919
    Model:                            OLS   Adj. R-squared:                  0.915
    Method:                 Least Squares   F-statistic:                     227.5
    Date:                Tue, 11 Jun 2024   Prob (F-statistic):           2.17e-12
    Time:                        06:30:58   Log-Likelihood:                -140.06
    No. Observations:                  22   AIC:                             284.1
    Df Residuals:                      20   BIC:                             286.3
    Df Model:                           1                                         
    Covariance Type:            nonrobust                                         
    ==============================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
    ------------------------------------------------------------------------------
    Intercept    559.4600   1450.698      0.386      0.704   -2466.642    3585.562
    X              0.3815      0.025     15.084      0.000       0.329       0.434
    ==============================================================================
    Omnibus:                        0.314   Durbin-Watson:                   1.479
    Prob(Omnibus):                  0.855   Jarque-Bera (JB):                0.390
    Skew:                          -0.242   Prob(JB):                        0.823
    Kurtosis:                       2.562   Cond. No.                     2.64e+06
    ==============================================================================
    
    Notes:
    [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
    [2] The condition number is large, 2.64e+06. This might indicate that there are
    strong multicollinearity or other numerical problems.
    

    enter image description here


    In the original code, the intercept at x = 0 is correct, but to visually represent it, the x-axis limits need to be expanded. Additionally, the slope appears distorted due to an unequal aspect ratio between the X and Y axes, causing a misrepresentation of the linear relationship.

    # Sort by X so fill_between() draws clean bands; the raw X values are
    # not monotonic, which would make the shaded regions fold back.
    df = df.sort_values('X').reset_index(drop=True)
    
    # Fit an ordinary least squares regression of Y on X
    model = smf.ols(formula='Y ~ X', data=df)
    results = model.fit()
    
    # Extract the fitted coefficients by name
    intercept = results.params['Intercept']
    slope = results.params['X']
    
    # Regression line extended down to x = 0 so the y-intercept is visible.
    # Upper end derived from the data instead of the hard-coded 59037.
    x_values = np.linspace(0, df['X'].max(), 100)
    y_values = intercept + slope * x_values
    
    # Prediction and confidence intervals both come from summary_frame();
    # results.conf_int() (parameter CIs) is not needed for the plot.
    predictions = results.get_prediction(df)
    prediction_summary_frame = predictions.summary_frame(alpha=0.05)
    
    # Data points
    ax = df.plot(kind='scatter', x='X', y='Y', color='black', label='Data', s=1, figsize=(12, 10))
    
    # Regression line
    ax.plot(x_values, y_values, color='#58C9F4', label='Regression Line')
    
    # Adding the point where the regression line intersects the y-axis
    ax.scatter(0, intercept, color='red', label='Y-intercept (x=0)', s=2)
    ax.annotate(f'(0, {intercept:.2f})', xy=(0, intercept), xytext=(2000, intercept + 1000), arrowprops=dict(facecolor='black', shrink=0.05))
    
    # Annotating the regression line with the slope
    ax.text(x_values[len(x_values)//2], y_values[len(y_values)//2], f'Slope: {slope:.2f}', fontsize=12, verticalalignment='bottom')
    
    # 95% prediction interval (new observations)
    ax.fill_between(df['X'], prediction_summary_frame['obs_ci_lower'], prediction_summary_frame['obs_ci_upper'], color='grey', alpha=0.2, label='95% Prediction Interval')
    
    # 95% confidence interval (mean response)
    ax.fill_between(df['X'], prediction_summary_frame['mean_ci_lower'], prediction_summary_frame['mean_ci_upper'], color='blue', alpha=0.2, label='95% Confidence Interval')
    
    # Equal aspect ratio so the slope is not visually distorted
    ax.set_aspect('equal', 'box')
    
    ax.set(xlabel='X', ylabel='Y', title='Linear Regression with Prediction and Confidence Intervals')
    ax.legend(bbox_to_anchor=(1, 0.5), loc='center left', frameon=False)
    plt.show()
    

    enter image description here