I am trying to create a class for calibrating a classifier. I have been reading resources on probability calibration, and I am a bit confused about which dataset the classifier should be calibrated on. I created a class that splits the training set further into a train set and a validation set. The classifier is first fitted to the train set and then predicts uncalibrated probabilities on the validation set.
Then I create a cal_model instance of CalibratedClassifierCV, fit it to the validation set, and predict calibrated probabilities on the validation set again.
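In other words, the workflow I am aiming for is roughly the following (a rough sketch with a stand-in dataset and classifier, not my actual code, and assuming a scikit-learn version that still accepts cv='prefit'):

# Sketch: fit on the train split, then calibrate the already-fitted model on the validation split
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier      # stand-in classifier, for illustration
from sklearn.calibration import CalibratedClassifierCV

X, y = make_classification(n_samples=2000, random_state=42)   # toy data for illustration
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=42)

base = RandomForestClassifier(random_state=42)
base.fit(train_X, train_y)                   # base model sees only the train split

# cv='prefit' tells CalibratedClassifierCV that `base` is already fitted, so .fit()
# below only learns the calibration mapping (sigmoid/Platt here) on the validation split
calibrated = CalibratedClassifierCV(base, method='sigmoid', cv='prefit')
calibrated.fit(val_X, val_y)

calibrated_probs = calibrated.predict_proba(val_X)[:, 1]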
Could someone take a look at the code below and correct it for me?
class calibrate_model:
    """
    A class that splits the training dataset into a train set and a validation set and then does
    probability calibration.

    model      = classification model
    Xtrain     = independent feature set
    ytrain     = target variable set
    cv         = cross-validation method
    cal_method = 'sigmoid' or 'isotonic'
    """
    def __init__(self, model, Xtrain, ytrain, cv, cal_method):
        self.model = model
        self.Xtrain = Xtrain
        self.ytrain = ytrain
        self.cv = cv
        self.cal_method = cal_method

    def calibrate_probability(self):
        import matplotlib.pyplot as plt
        from sklearn.model_selection import train_test_split
        from sklearn.calibration import CalibratedClassifierCV
        from sklearn.calibration import calibration_curve

        # hold out a validation set; `seed` is assumed to be defined elsewhere
        train_X, val_X, train_y, val_y = train_test_split(self.Xtrain,
                                                          self.ytrain,
                                                          test_size=0.2,
                                                          random_state=seed)

        # uncalibrated model: fit on the cross-validation folds of the train split
        for train_index, test_index in self.cv.split(train_X, train_y):
            X_train_kfold, X_val_kfold = train_X[train_index], train_X[test_index]
            y_train_kfold, y_val_kfold = train_y[train_index], train_y[test_index]
            self.model.fit(X_train_kfold, y_train_kfold)

        uc_probs = self.model.predict_proba(val_X)[:, 1]
        uc_fop, uc_mpv = calibration_curve(val_y, uc_probs, n_bins=10, normalize=True,
                                           strategy='quantile')

        # calibrating model
        self.cal_model = CalibratedClassifierCV(self.model, method=self.cal_method, cv=self.cv)
        self.cal_model.fit(val_X, val_y)

        # predict calibrated probabilities
        c_probs = self.cal_model.predict_proba(val_X)[:, 1]

        # reliability diagram
        c_fop, c_mpv = calibration_curve(val_y, c_probs, n_bins=10, normalize=True,
                                         strategy='quantile')

        # perfectly calibrated reference line
        plt.plot([0, 1], [0, 1], linestyle='--')
        # uncalibrated model reliability
        plt.plot(uc_mpv, uc_fop, marker='.', label='Uncalibrated')
        # calibrated model reliability
        plt.plot(c_mpv, c_fop, marker='.', label='Calibrated')
        plt.title(type(self.model).__name__ + ' ' + self.cal_method)
        plt.ylabel('Fraction of Positives (fop)')
        plt.xlabel('Mean Predicted Value (mpv)')
        plt.legend()
        plt.tight_layout()
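For context, I would call the class along these lines (the dataset, classifier, and cross-validation settings here are just placeholders, not my real data, and as above this assumes a scikit-learn version that still accepts calibration_curve's normalize argument):

# Placeholder usage of calibrate_model, for illustration only
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

seed = 42                                    # the class above refers to a global `seed`
X, y = make_classification(n_samples=2000, random_state=seed)

cm = calibrate_model(model=LogisticRegression(max_iter=1000),
                     Xtrain=X,
                     ytrain=y,
                     cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=seed),
                     cal_method='sigmoid')
cm.calibrate_probability()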
The calibration_curve code is correct. I am comparing logistic regression calibration versus XGBoost calibration. The dataframes hold the predict_proba[:, 1] values, i.e. the probability of the event happening. See https://github.com/dnishimoto/python-deep-learning/blob/master/Credit%20Loan%20Risk%20.ipynb
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve

# logistic regression probabilities (pipeline comes from the linked notebook)
y_pred_prob_lr = pipeline['lr'].predict_proba(X_test)
y_preds_proba_lr_df = pd.DataFrame(y_pred_prob_lr[:, 1], columns=["pred_default_proba"])

# XGBoost classifier probabilities
xg_cl = xgb.XGBClassifier(objective='binary:logistic', n_estimators=10, seed=123)
xg_cl.fit(X_train, y_train)
y_pred_xg = xg_cl.predict(X_test)
y_pred_proba_xg = xg_cl.predict_proba(X_test)
y_preds_proba_xg_df = pd.DataFrame(y_pred_proba_xg[:, 1], columns=['prob_default'])

# reliability curves for both models
frac_of_pos, mean_pred_val = calibration_curve(y_test, y_preds_proba_xg_df, n_bins=10,
                                               normalize=True, strategy='quantile')
frac_of_pos_lr, mean_pred_val_lr = calibration_curve(y_test, y_preds_proba_lr_df, n_bins=10,
                                                     normalize=True, strategy='quantile')

plt.plot([0, 1], [0, 1], 'k:', label="Perfectly calibrated")
plt.plot(mean_pred_val, frac_of_pos, 's-', label='XGBoost')
plt.plot(mean_pred_val_lr, frac_of_pos_lr, 's-', label='Logistic Regression')
plt.xlabel('Average Predicted Probability')
plt.ylabel('Fraction of positives')
plt.legend()
plt.title('Calibration Curve')
plt.show()
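If you want a single-number check alongside the curves, the Brier score from sklearn.metrics summarizes how close each model's probabilities are to the observed outcomes (lower is better), reusing the variables above:

from sklearn.metrics import brier_score_loss

# lower Brier score = predicted probabilities closer to the observed outcomes
print('XGBoost Brier score:             %.4f' % brier_score_loss(y_test, y_pred_proba_xg[:, 1]))
print('Logistic regression Brier score: %.4f' % brier_score_loss(y_test, y_pred_prob_lr[:, 1]))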