attached is the code for xgboost on ftir data with smote and smote_weights. the results based on smote is attached as image. From the confusion matrix, i understood that even after applying smote, class 0 is not being utilized in any fold. So i try to include sample weights and end up with following error.
with smote
import time
import numpy as np
import as sio
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import f1_score, classification_report, accuracy_score, precision_score, recall_score, confusion_matrix
from scipy.signal import savgol_filter
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import matplotlib.pyplot as plt
import seaborn as sns
def evaluate_model_performance(y_true, y_pred, model):
Evaluate and display the performance of the trained model.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='macro')
recall = recall_score(y_true, y_pred, average='macro')
print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")
print("\nClassification Report:\n", classification_report(y_true, y_pred))
cm = confusion_matrix(y_true, y_pred)
sns.heatmap(cm, annot=True, fmt='d')
plt.title("Confusion Matrix")
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
if hasattr(model, 'feature_importances_') and model.feature_importances_.any():
plt.figure(figsize=(10, 8))
xgb.plot_importance(model, max_num_features=10)
plt.title("Feature importances")
def display_class_distribution(y):
Display the distribution of classes in the label array.
unique, counts = np.unique(y, return_counts=True)
distribution = pd.DataFrame({'Class': unique, 'Count': counts})
def create_class_distribution_df(y):
unique, counts = np.unique(y, return_counts=True)
distribution_df = pd.DataFrame({'Class': unique, 'Count': counts})
distribution_df['Percentage'] = distribution_df['Count'] / distribution_df['Count'].sum() * 100
# Filter out classes with zero counts
distribution_df = distribution_df[distribution_df['Count'] > 0]
return distribution_df
# Start timing
tic = time.perf_counter()
# Data Loading and Preprocessing
print("Loading data...")
Mat = sio.loadmat("/content/drive/MyDrive/Codes/Xgboost_fasa/R_FASA_Binned_spectra_all.mat")
spectraTissue = Mat['spectraTissue']
spectraTissue = savgol_filter(spectraTissue, 5, 4, 0, axis=0)
NHG = [int(x) for x in Mat['label'].T]
y = LabelEncoder().fit_transform(NHG)
patientNo = Mat['patientNo'].T
print("Size of spectraTissue:", spectraTissue.shape)
# Initial Class Distribution
print("Initial class distribution:")
# Model Training and Evaluation
train_perf, test_perf, fold_details, f1_train_scores, f1_test_scores, hyperparameters, class_distribution = [], [], [], [], [], [], []
nsplits = 5
skf = StratifiedKFold(n_splits=nsplits, shuffle=True, random_state=0)
print("Starting cross-validation with SMOTE...")
for fold, (train_index, test_index) in enumerate(skf.split(spectraTissue, y), start=1):
X_train, X_test, y_train, y_test = spectraTissue[train_index], spectraTissue[test_index], y[train_index], y[test_index]
# Pipeline for GridSearchCV with SMOTE
pipeline = ImbPipeline([
('smote', SMOTE(random_state=42)),
('classifier', xgb.XGBClassifier(objective='multi:softprob', eval_metric='mlogloss'))
paramspace = {
'classifier__n_estimators': [50, 100, 200],
'classifier__max_depth': [3, 5, 7],
'classifier__learning_rate': [0.01, 0.1, 0.2],
'classifier__subsample': [0.5, 0.7, 0.9],
'classifier__colsample_bytree': [0.5, 0.7, 0.9],
grid = GridSearchCV(pipeline, param_grid=paramspace, cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42), scoring='f1_macro', verbose=2)
mod =, y_train)
# Apply SMOTE to the training data
X_resampled, y_resampled = mod.best_estimator_.named_steps['smote'].fit_resample(X_train, y_train)
# Calculate the class distribution after SMOTE
resampled_distribution_df = create_class_distribution_df(y_resampled)
print(f"\nClass distribution after SMOTE in fold {fold}:")
print("Unique classes after SMOTE:", np.unique(y_resampled))
# Predict and evaluate
preds_train = mod.predict(X_train)
preds_test = mod.predict(X_test)
train_f1, test_f1 = f1_score(y_train, preds_train, average='macro'), f1_score(y_test, preds_test, average='macro')
# Store fold details
'Fold': fold,
'Train Size': len(X_train),
'Test Size': len(X_test),
'Best Params': mod.best_params_,
'Train F1 Score': train_f1,
'Test F1 Score': test_f1
# Output performance evaluation for the fold
evaluate_model_performance(y_test, preds_test, mod.best_estimator_)
# Calculate the average class distribution after SMOTE
print("\nAverage class distribution after SMOTE across folds:")
avg_class_distribution = np.mean(class_distribution, axis=0)
distribution_df = create_class_distribution_df(avg_class_distribution.astype(int))
# Display the class distribution
# Save the class distribution to a CSV file
distribution_csv_path = "average_class_distribution_after_smote.csv"
distribution_df.to_csv(distribution_csv_path, index=False)
print(f"Class distribution after SMOTE saved to {distribution_csv_path}")
# Results Visualization
# F1 Score Across Folds Plot
plt.figure(figsize=(10, 5))
folds = list(range(1, nsplits + 1))
plt.plot(folds, f1_train_scores, label='Train F1 Score', marker='o')
plt.plot(folds, f1_test_scores, label='Test F1 Score', marker='o')
plt.title('F1 Score Across Folds')
plt.ylabel('F1 Score')
# Save the overall performance and fold details
df_fold_details = pd.DataFrame(fold_details)
df_performance = pd.DataFrame({
'Fold': folds,
'Train Performance': train_perf,
'Test Performance': test_perf,
'Hyperparameters': [str(params) for params in hyperparameters]
df_fold_details.to_csv("R_Fasa_XGB_fold_details_with_smote.csv", index=False)
df_performance.to_csv("R_Fasa_XGB_performance_summary_with_smote.csv", index=False)
print("Model training and evaluation complete.")
toc = time.perf_counter()
print(f"Execution time: {toc - tic:.2f} seconds")
with smote - ROC _ Weights
import time
import numpy as np
import as sio
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import f1_score, classification_report, accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, auc
from scipy.signal import savgol_filter
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight
# Function to evaluate model performance
def evaluate_model_performance(y_true, y_pred, y_proba, model):
Evaluate and display the performance of the trained model.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='macro')
recall = recall_score(y_true, y_pred, average='macro')
f1 = f1_score(y_true, y_pred, average='macro')
print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
# Plot confusion matrix
cm = confusion_matrix(y_true, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix")
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
# Plot feature importances if available
if hasattr(model, 'feature_importances_') and model.feature_importances_.any():
plt.figure(figsize=(10, 8))
xgb.plot_importance(model, max_num_features=10)
plt.title("Feature importances")
# ROC-AUC calculations and plotting
# Compute ROC curve and ROC area for each class
n_classes = y_proba.shape[1]
fpr = dict()
tpr = dict()
roc_auc = dict()
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_true.ravel(), y_proba.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
# Plot the ROC curve for the micro-average
plt.figure(figsize=(10, 8))
plt.plot(fpr["micro"], tpr["micro"],
label='micro-average ROC curve (area = {0:0.2f})'
# Plot ROC curve for each class
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_true == i, y_proba[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})'
''.format(i, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
# Function to display class distribution
def display_class_distribution(y):
Display the distribution of classes in the label array.
unique, counts = np.unique(y, return_counts=True)
distribution = pd.DataFrame({'Class': unique, 'Count': counts})
distribution['Percentage'] = distribution['Count'] / sum(counts) * 100
# Start timing
tic = time.perf_counter()
# Data Loading and Preprocessing
print("Loading data...")
Mat = sio.loadmat("/content/drive/MyDrive/Codes/Xgboost_fasa/R_FASA_Binned_spectra_all.mat")
spectraTissue = Mat['spectraTissue']
spectraTissue = savgol_filter(spectraTissue, 5, 4, 0, axis=0)
NHG = [int(x) for x in Mat['label'].T]
y = LabelEncoder().fit_transform(NHG)
print("Size of spectraTissue:", spectraTissue.shape)
# Initial Class Distribution
print("Initial class distribution:")
# Model Training and Evaluation
nsplits = 5
skf = StratifiedKFold(n_splits=nsplits, shuffle=True, random_state=0)
# Store performance metrics for each fold
f1_train_scores = []
f1_test_scores = []
fold_details = []
print("Starting cross-validation with SMOTE...")
for fold, (train_index, test_index) in enumerate(skf.split(spectraTissue, y), start=1):
X_train, X_test = spectraTissue[train_index], spectraTissue[test_index]
y_train, y_test = y[train_index], y[test_index]
# Compute sample weights for the current fold
sample_weight = compute_sample_weight(class_weight='balanced', y=y_train)
# Create an XGBClassifier instance
xgb_clf = xgb.XGBClassifier(objective='multi:softprob', eval_metric='mlogloss', use_label_encoder=False)
# Create a pipeline with SMOTE and the XGBoost classifier
pipeline = ImbPipeline([
('smote', SMOTE(random_state=42)),
('classifier', xgb_clf)
# Define the parameter space for GridSearchCV
paramspace = {
'classifier__n_estimators': [50, 100, 200],
'classifier__max_depth': [3, 5, 7],
'classifier__learning_rate': [0.01, 0.1, 0.2],
'classifier__subsample': [0.5, 0.7, 0.9],
'classifier__colsample_bytree': [0.5, 0.7, 0.9],
# Do not include 'classifier__scale_pos_weight' here since we're passing it in fit method
# Compute sample weights for the current fold
sample_weight = compute_sample_weight(class_weight='balanced', y=y_train)
# Configure and run GridSearchCV
grid = GridSearchCV(
cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
# Set up GridSearchCV
grid = GridSearchCV(pipeline, param_grid=paramspace, scoring='f1_macro', cv=3, verbose=2)
# Fit the GridSearchCV, y_train, classifier__sample_weight=sample_weight)
print(X_train.shape, y_train.shape, sample_weight.shape)
fit_params = {'classifier__sample_weight': sample_weight}, y_train, **fit_params)
# Get the best estimator
best_pipeline = grid.best_estimator_
# Make predictions
preds_train = best_pipeline.predict(X_train)
preds_test = best_pipeline.predict(X_test)
preds_proba_test = best_pipeline.predict_proba(X_test)
# Evaluate and store metrics for each fold
f1_train = f1_score(y_train, preds_train, average='macro')
f1_test = f1_score(y_test, preds_test, average='macro')
'Fold': fold,
'Train F1 Score': f1_train,
'Test F1 Score': f1_test,
'Best Params': grid.best_params_
# Evaluate model performance with detailed report and visualizations
evaluate_model_performance(y_test, preds_test, preds_proba_test, best_pipeline.named_steps['classifier'])
# Calculate and display average class distribution after SMOTE across folds
print("\nAverage class distribution after SMOTE across folds:")
avg_class_distribution = np.mean([np.bincount(y_train) for _, y_train in skf.split(spectraTissue, y)], axis=0)
distribution_df = create_class_distribution_df(avg_class_distribution.astype(int))
# Results Visualization
# Plot F1 Score Across Folds
plt.figure(figsize=(10, 5))
folds = list(range(1, nsplits + 1))
plt.plot(folds, f1_train_scores, label='Train F1 Score', marker='o')
plt.plot(folds, f1_test_scores, label='Test F1 Score', marker='o')
plt.title('F1 Score Across Folds')
plt.ylabel('F1 Score')
# End timing and print execution time
toc = time.perf_counter()
print(f"\nModel training and evaluation complete. Execution time: {toc - tic:.2f} seconds")
# Save the fold details to a CSV file for later analysis
df_fold_details = pd.DataFrame(fold_details)
df_fold_details.to_csv("/content/drive/MyDrive/Codes/Xgboost_fasa/fold_details.csv", index=False)
Error: ValueError Traceback (most recent call last)
<ipython-input-10-5ae7e305ec5b> in <cell line: 120>()
161 # Fit the GridSearchCV
--> 162, y_train, classifier__sample_weight=sample_weight)
163 print(X_train.shape, y_train.shape, sample_weight.shape)
3 frames
/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/ in _warn_or_raise_about_fit_failures(results, error_score)
365 f"Below are more details about the failures:\n{fit_errors_summary}"
366 )
--> 367 raise ValueError(all_fits_failed_message)
369 else:
All the 729 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.
Class distribution
Dataset: FTIR output (.mat file) spectral data. I tried smote, borderline smote, tried svmsmote + easyensemble but still didn't work. The f1 score is very low and i suspect that is becasue the class 0 is not evenly weighed in the xgboost algorithm.
Any advice or modifications will be highly appreciated.
Using and even tuning the scale_pos_weight
parameter can be very effective for imbalanced classification problems, over resampling methods.
You can even use SMOTE and add an additional bias via scale_pos_weight
, although tuning may be required to find the right balance that offers a benefit over using scale_pos_weight
alone, if at all.
This example may help in tuning scale_pos_weight