xgboostsmote

Xgboost with Smote on imbalanced data


attached is the code for xgboost on ftir data with smote and smote_weights. the results based on smote is attached as image. From the confusion matrix, i understood that even after applying smote, class 0 is not being utilized in any fold. So i try to include sample weights and end up with following error.

'''
with smote
'''

import time
import numpy as np
import scipy.io as sio
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import f1_score, classification_report, accuracy_score, precision_score, recall_score, confusion_matrix
from scipy.signal import savgol_filter
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import matplotlib.pyplot as plt
import seaborn as sns



def evaluate_model_performance(y_true, y_pred, model):
    """
    Evaluate and display the performance of the trained model.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")
    print("\nClassification Report:\n", classification_report(y_true, y_pred))
    cm = confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, annot=True, fmt='d')
    plt.title("Confusion Matrix")
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()
    if hasattr(model, 'feature_importances_') and model.feature_importances_.any():
        plt.figure(figsize=(10, 8))
        xgb.plot_importance(model, max_num_features=10)
        plt.title("Feature importances")
        plt.show()

def display_class_distribution(y):
    """
    Display the distribution of classes in the label array.
    """
    unique, counts = np.unique(y, return_counts=True)
    distribution = pd.DataFrame({'Class': unique, 'Count': counts})
    print(distribution)

def create_class_distribution_df(y):
    unique, counts = np.unique(y, return_counts=True)
    distribution_df = pd.DataFrame({'Class': unique, 'Count': counts})
    distribution_df['Percentage'] = distribution_df['Count'] / distribution_df['Count'].sum() * 100
    # Filter out classes with zero counts
    distribution_df = distribution_df[distribution_df['Count'] > 0]
    return distribution_df

# Start timing
tic = time.perf_counter()

# Data Loading and Preprocessing

print("Loading data...")
Mat = sio.loadmat("/content/drive/MyDrive/Codes/Xgboost_fasa/R_FASA_Binned_spectra_all.mat")
spectraTissue = Mat['spectraTissue']
spectraTissue = savgol_filter(spectraTissue, 5, 4, 0, axis=0)
NHG = [int(x) for x in Mat['label'].T]
y = LabelEncoder().fit_transform(NHG)
patientNo = Mat['patientNo'].T
print("Size of spectraTissue:", spectraTissue.shape)

# Initial Class Distribution
print("Initial class distribution:")
display_class_distribution(y)

# Model Training and Evaluation

train_perf, test_perf, fold_details, f1_train_scores, f1_test_scores, hyperparameters, class_distribution = [], [], [], [], [], [], []
nsplits = 5
skf = StratifiedKFold(n_splits=nsplits, shuffle=True, random_state=0)

print("Starting cross-validation with SMOTE...")
for fold, (train_index, test_index) in enumerate(skf.split(spectraTissue, y), start=1):
    X_train, X_test, y_train, y_test = spectraTissue[train_index], spectraTissue[test_index], y[train_index], y[test_index]

    # Pipeline for GridSearchCV with SMOTE
    pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('classifier', xgb.XGBClassifier(objective='multi:softprob', eval_metric='mlogloss'))
    ])
    paramspace = {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [3, 5, 7],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__subsample': [0.5, 0.7, 0.9],
        'classifier__colsample_bytree': [0.5, 0.7, 0.9],
    }
    grid = GridSearchCV(pipeline, param_grid=paramspace, cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42), scoring='f1_macro', verbose=2)
    mod = grid.fit(X_train, y_train)

    # Apply SMOTE to the training data
    X_resampled, y_resampled = mod.best_estimator_.named_steps['smote'].fit_resample(X_train, y_train)

    # Calculate the class distribution after SMOTE
    resampled_distribution_df = create_class_distribution_df(y_resampled)
    print(f"\nClass distribution after SMOTE in fold {fold}:")
    print(resampled_distribution_df)

    print("Unique classes after SMOTE:", np.unique(y_resampled))

    # Predict and evaluate
    preds_train = mod.predict(X_train)
    preds_test = mod.predict(X_test)
    train_f1, test_f1 = f1_score(y_train, preds_train, average='macro'), f1_score(y_test, preds_test, average='macro')
    f1_train_scores.append(train_f1)
    f1_test_scores.append(test_f1)
    train_perf.append(train_f1)
    test_perf.append(test_f1)
    hyperparameters.append(mod.best_params_)
    class_distribution.append(np.bincount(y_train))

    # Store fold details
    fold_details.append({
        'Fold': fold,
        'Train Size': len(X_train),
        'Test Size': len(X_test),
        'Best Params': mod.best_params_,
        'Train F1 Score': train_f1,
        'Test F1 Score': test_f1
    })

    # Output performance evaluation for the fold
    evaluate_model_performance(y_test, preds_test, mod.best_estimator_)

# Calculate the average class distribution after SMOTE
print("\nAverage class distribution after SMOTE across folds:")
avg_class_distribution = np.mean(class_distribution, axis=0)
distribution_df = create_class_distribution_df(avg_class_distribution.astype(int))

# Display the class distribution
print(distribution_df)

# Save the class distribution to a CSV file
distribution_csv_path = "average_class_distribution_after_smote.csv"
distribution_df.to_csv(distribution_csv_path, index=False)
print(f"Class distribution after SMOTE saved to {distribution_csv_path}")

# Results Visualization

# F1 Score Across Folds Plot
plt.figure(figsize=(10, 5))
folds = list(range(1, nsplits + 1))
plt.plot(folds, f1_train_scores, label='Train F1 Score', marker='o')
plt.plot(folds, f1_test_scores, label='Test F1 Score', marker='o')
plt.title('F1 Score Across Folds')
plt.xlabel('Fold')
plt.ylabel('F1 Score')
plt.legend()
plt.show()

# Save the overall performance and fold details
df_fold_details = pd.DataFrame(fold_details)
df_performance = pd.DataFrame({
    'Fold': folds,
    'Train Performance': train_perf,
    'Test Performance': test_perf,
    'Hyperparameters': [str(params) for params in hyperparameters]
})

df_fold_details.to_csv("R_Fasa_XGB_fold_details_with_smote.csv", index=False)
df_performance.to_csv("R_Fasa_XGB_performance_summary_with_smote.csv", index=False)

print("Model training and evaluation complete.")
toc = time.perf_counter()
print(f"Execution time: {toc - tic:.2f} seconds")

'''
with smote - ROC _ Weights
'''

import time
import numpy as np
import scipy.io as sio
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import f1_score, classification_report, accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, auc
from scipy.signal import savgol_filter
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight




# Function to evaluate model performance
def evaluate_model_performance(y_true, y_pred, y_proba, model):
    """
    Evaluate and display the performance of the trained model.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

    # Plot confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title("Confusion Matrix")
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()

    # Plot feature importances if available
    if hasattr(model, 'feature_importances_') and model.feature_importances_.any():
        plt.figure(figsize=(10, 8))
        xgb.plot_importance(model, max_num_features=10)
        plt.title("Feature importances")
        plt.show()

    # ROC-AUC calculations and plotting
    # Compute ROC curve and ROC area for each class
    n_classes = y_proba.shape[1]
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_true.ravel(), y_proba.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    
    # Plot the ROC curve for the micro-average
    plt.figure(figsize=(10, 8))
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["micro"]))
    
    # Plot ROC curve for each class
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_true == i, y_proba[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
        plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})'
                                       ''.format(i, roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Extension of Receiver operating characteristic to multi-class')
    plt.legend(loc="lower right")
    plt.show()

# Function to display class distribution
def display_class_distribution(y):
    """
    Display the distribution of classes in the label array.
    """
    unique, counts = np.unique(y, return_counts=True)
    distribution = pd.DataFrame({'Class': unique, 'Count': counts})
    distribution['Percentage'] = distribution['Count'] / sum(counts) * 100
    print(distribution)

# Start timing
tic = time.perf_counter()

# Data Loading and Preprocessing
print("Loading data...")
Mat = sio.loadmat("/content/drive/MyDrive/Codes/Xgboost_fasa/R_FASA_Binned_spectra_all.mat")
spectraTissue = Mat['spectraTissue']
spectraTissue = savgol_filter(spectraTissue, 5, 4, 0, axis=0)
NHG = [int(x) for x in Mat['label'].T]
y = LabelEncoder().fit_transform(NHG)
print("Size of spectraTissue:", spectraTissue.shape)



# Initial Class Distribution
print("Initial class distribution:")
display_class_distribution(y)

# Model Training and Evaluation
nsplits = 5
skf = StratifiedKFold(n_splits=nsplits, shuffle=True, random_state=0)

# Store performance metrics for each fold
f1_train_scores = []
f1_test_scores = []
fold_details = []

print("Starting cross-validation with SMOTE...")
for fold, (train_index, test_index) in enumerate(skf.split(spectraTissue, y), start=1):
    X_train, X_test = spectraTissue[train_index], spectraTissue[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Compute sample weights for the current fold
    sample_weight = compute_sample_weight(class_weight='balanced', y=y_train)

    # Create an XGBClassifier instance
    xgb_clf = xgb.XGBClassifier(objective='multi:softprob', eval_metric='mlogloss', use_label_encoder=False)

    # Create a pipeline with SMOTE and the XGBoost classifier
    pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('classifier', xgb_clf)
    ])

    # Define the parameter space for GridSearchCV
    paramspace = {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [3, 5, 7],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__subsample': [0.5, 0.7, 0.9],
        'classifier__colsample_bytree': [0.5, 0.7, 0.9],
        # Do not include 'classifier__scale_pos_weight' here since we're passing it in fit method
    }

    # Compute sample weights for the current fold
    sample_weight = compute_sample_weight(class_weight='balanced', y=y_train)

    # Configure and run GridSearchCV
    grid = GridSearchCV(
        pipeline,
        param_grid=paramspace,
        cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
        scoring='f1_macro',
        verbose=2
    )

    # Set up GridSearchCV
    grid = GridSearchCV(pipeline, param_grid=paramspace, scoring='f1_macro', cv=3, verbose=2)

    # Fit the GridSearchCV
    grid.fit(X_train, y_train, classifier__sample_weight=sample_weight)
    print(X_train.shape, y_train.shape, sample_weight.shape)

   

    fit_params = {'classifier__sample_weight': sample_weight}
    grid.fit(X_train, y_train, **fit_params)
    # Get the best estimator
    best_pipeline = grid.best_estimator_

    # Make predictions
    preds_train = best_pipeline.predict(X_train)
    preds_test = best_pipeline.predict(X_test)
    preds_proba_test = best_pipeline.predict_proba(X_test)

    # Evaluate and store metrics for each fold
    f1_train = f1_score(y_train, preds_train, average='macro')
    f1_test = f1_score(y_test, preds_test, average='macro')
    f1_train_scores.append(f1_train)
    f1_test_scores.append(f1_test)
    fold_details.append({
        'Fold': fold,
        'Train F1 Score': f1_train,
        'Test F1 Score': f1_test,
        'Best Params': grid.best_params_
    })

    # Evaluate model performance with detailed report and visualizations
    evaluate_model_performance(y_test, preds_test, preds_proba_test, best_pipeline.named_steps['classifier'])

# Calculate and display average class distribution after SMOTE across folds
print("\nAverage class distribution after SMOTE across folds:")
avg_class_distribution = np.mean([np.bincount(y_train) for _, y_train in skf.split(spectraTissue, y)], axis=0)
distribution_df = create_class_distribution_df(avg_class_distribution.astype(int))
print(distribution_df)

# Results Visualization
# Plot F1 Score Across Folds
plt.figure(figsize=(10, 5))
folds = list(range(1, nsplits + 1))
plt.plot(folds, f1_train_scores, label='Train F1 Score', marker='o')
plt.plot(folds, f1_test_scores, label='Test F1 Score', marker='o')
plt.title('F1 Score Across Folds')
plt.xlabel('Fold')
plt.ylabel('F1 Score')
plt.legend()
plt.show()

# End timing and print execution time
toc = time.perf_counter()
print(f"\nModel training and evaluation complete. Execution time: {toc - tic:.2f} seconds")

# Save the fold details to a CSV file for later analysis
df_fold_details = pd.DataFrame(fold_details)
df_fold_details.to_csv("/content/drive/MyDrive/Codes/Xgboost_fasa/fold_details.csv", index=False)

Error: ValueError                                Traceback (most recent call last)
<ipython-input-10-5ae7e305ec5b> in <cell line: 120>()
    160 
    161     # Fit the GridSearchCV
--> 162     grid.fit(X_train, y_train, classifier__sample_weight=sample_weight)
    163     print(X_train.shape, y_train.shape, sample_weight.shape)
    164 

3 frames
/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py in _warn_or_raise_about_fit_failures(results, error_score)
    365                 f"Below are more details about the failures:\n{fit_errors_summary}"
    366             )
--> 367             raise ValueError(all_fits_failed_message)
    368 
    369         else:

ValueError: 
All the 729 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Class distribution Class distribution

Dataset: FTIR output (.mat file) spectral data. I tried smote, borderline smote, tried svmsmote + easyensemble but still didn't work. The f1 score is very low and i suspect that is becasue the class 0 is not evenly weighed in the xgboost algorithm.

Any advice or modifications will be highly appreciated.


Solution

  • Using and even tuning the scale_pos_weight parameter can be very effective for imbalanced classification problems, over resampling methods.

    You can even use SMOTE and add an additional bias via scale_pos_weight, although tuning may be required to find the right balance that offers a benefit over using scale_pos_weight alone, if at all.

    This example may help in tuning scale_pos_weight: https://xgboosting.com/xgboost-tune-scale_pos_weight-parameter/