Tags: python, machine-learning, calibration

Code for probability calibration for classification


I am trying to create a class for calibrating a classifier. I have been reading resources on probability calibration, and I am a bit confused about which dataset the classifier should be calibrated on. I created a class that splits the training set further into a train set and a validation set. The classifier is first fitted to the train set and then predicts uncalibrated probabilities on the validation set.

Then, I create a cal_model instance of the CalibratedClassifierCV class, fit it to the validation set, and predict calibrated probabilities on the validation set again.
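
As I understand the hold-out pattern from those resources, the model is fitted on the train split and only the calibration map is learned on the validation split. A minimal sketch of that pattern, assuming scikit-learn's CalibratedClassifierCV with cv='prefit' (the data and model here are just placeholders):

from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=2000, random_state=42)
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2,
                                                  random_state=42)

# fit the base model on the train split only
base = LogisticRegression(max_iter=1000).fit(train_X, train_y)

# cv='prefit' tells CalibratedClassifierCV the base model is already fitted,
# so fit() learns only the calibration map on the held-out validation split
calibrated = CalibratedClassifierCV(base, method='sigmoid', cv='prefit')
calibrated.fit(val_X, val_y)

c_probs = calibrated.predict_proba(val_X)[:, 1]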

Could someone take a look at the code below and correct it for me?

class calibrate_model:
    """
    A class that splits the training dataset into train and validation sets and then
    performs probability calibration.
    
    model = Classification model
    Xtrain = Independent feature set
    ytrain = target variable set
    cv = cross validation method
    cal_method = 'sigmoid' or 'isotonic'.
    
    """
    def __init__(self, model, Xtrain, ytrain, cv, cal_method):
        self.model = model
        self.Xtrain = Xtrain
        self.ytrain = ytrain
        self.cv = cv
        self.cal_method = cal_method
        
    def calibrate_probability(self):
        
        from sklearn.model_selection import train_test_split
        from sklearn.calibration import CalibratedClassifierCV
        from sklearn.calibration import calibration_curve
        import matplotlib.pyplot as plt

        # fixed seed for a reproducible split
        train_X, val_X, train_y, val_y = train_test_split(self.Xtrain,
                                                          self.ytrain,
                                                          test_size=0.2,
                                                          random_state=42)
        
        
        # uncalibrated model: fit on the cross-validation folds of the train
        # split (each iteration refits self.model, so only the last fold's
        # fit is kept); assumes array inputs that support positional indexing
        for train_index, test_index in self.cv.split(train_X, train_y):
            X_train_kfold, X_val_kfold = train_X[train_index], train_X[test_index]
            y_train_kfold, y_val_kfold = train_y[train_index], train_y[test_index]
            self.model.fit(X_train_kfold, y_train_kfold)
            
        uc_probs = self.model.predict_proba(val_X)[:, 1]
        # predict_proba is already in [0, 1], so the deprecated
        # normalize flag is dropped
        uc_fop, uc_mpv = calibration_curve(val_y, uc_probs, n_bins=10,
                                           strategy='quantile')
    

        # calibrating model: self.model is already fitted above, so
        # cv='prefit' learns only the calibration map on the validation set
        self.cal_model = CalibratedClassifierCV(self.model,
                                                method=self.cal_method,
                                                cv='prefit')
        self.cal_model.fit(val_X, val_y)
        
        # predict probabilities
        c_probs = self.cal_model.predict_proba(val_X)[:, 1]
        
        # reliability diagram
        c_fop, c_mpv = calibration_curve(val_y, c_probs, n_bins=10,
                                         strategy='quantile')

        # plot the perfectly calibrated reference line
        plt.plot([0, 1], [0, 1], linestyle='--')

        # plot uncalibrated model reliability
        plt.plot(uc_mpv, uc_fop, marker='.', label='Uncalibrated')

        # plot calibrated model reliability
        plt.plot(c_mpv, c_fop, marker='.', label='Calibrated')

        plt.title(type(self.model).__name__ + ' ' + self.cal_method)
        plt.ylabel('Fraction of Positives (fop)')
        plt.xlabel('Mean Predicted Value (mpv)')
        plt.legend()
        plt.tight_layout()
        plt.show()
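
For reference, this is how I call the class (a hypothetical invocation; the data, model, and CV object are placeholders):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

# synthetic data and a placeholder model, just to exercise the class
X, y = make_classification(n_samples=2000, random_state=42)
cm = calibrate_model(model=RandomForestClassifier(random_state=42),
                     Xtrain=X, ytrain=y,
                     cv=StratifiedKFold(n_splits=5),
                     cal_method='sigmoid')
cm.calibrate_probability()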

Solution

  • The calibration_curve code is correct. I am comparing logistic regression calibration against XGBoost calibration. The dataframes hold the predict_proba[:, 1] values, i.e. the probability of the positive class (default). See https://github.com/dnishimoto/python-deep-learning/blob/master/Credit%20Loan%20Risk%20.ipynb

      import pandas as pd
      import xgboost as xgb
      import matplotlib.pyplot as plt
      from sklearn.calibration import calibration_curve

      # logistic regression probabilities for the positive class
      y_pred_prob_lr = pipeline['lr'].predict_proba(X_test)
      y_preds_proba_lr_df = pd.DataFrame(y_pred_prob_lr[:, 1],
                                         columns=['pred_default_proba'])

      # XGBoost classifier
      xg_cl = xgb.XGBClassifier(objective='binary:logistic',
                                n_estimators=10, seed=123)
      xg_cl.fit(X_train, y_train)

      y_pred_xg = xg_cl.predict(X_test)
      y_pred_proba_xg = xg_cl.predict_proba(X_test)
      y_preds_proba_xg_df = pd.DataFrame(y_pred_proba_xg[:, 1],
                                         columns=['prob_default'])

      # reliability curves; predict_proba is already in [0, 1], so the
      # deprecated normalize flag is dropped
      frac_of_pos, mean_pred_val = calibration_curve(
          y_test, y_preds_proba_xg_df['prob_default'],
          n_bins=10, strategy='quantile')
      frac_of_pos_lr, mean_pred_val_lr = calibration_curve(
          y_test, y_preds_proba_lr_df['pred_default_proba'],
          n_bins=10, strategy='quantile')

      plt.plot([0, 1], [0, 1], 'k:', label='Perfectly calibrated')
      plt.plot(mean_pred_val, frac_of_pos, 's-', label='XGBoost')
      plt.plot(mean_pred_val_lr, frac_of_pos_lr, 's-',
               label='Logistic Regression')

      # x axis: average predicted probability; y axis: observed fraction
      plt.xlabel('Average Predicted Probability')
      plt.ylabel('Fraction of positives')
      plt.legend()
      plt.title('Calibration Curve')
      plt.show()
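
    To attach a single number to each reliability curve, the diagram can be complemented with the Brier score (scikit-learn's brier_score_loss); a short sketch, assuming the same y_test and probability dataframes as above:

      from sklearn.metrics import brier_score_loss

      # mean squared error between predicted probabilities and the binary
      # outcomes; lower is better
      print('XGBoost Brier score: %.4f' %
            brier_score_loss(y_test, y_preds_proba_xg_df['prob_default']))
      print('Logistic Regression Brier score: %.4f' %
            brier_score_loss(y_test, y_preds_proba_lr_df['pred_default_proba']))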