Attached is the code for XGBoost on FTIR data, first with SMOTE alone and then with SMOTE plus sample weights. The results with SMOTE alone are attached as an image. From the confusion matrix, I can see that even after applying SMOTE, class 0 is never predicted in any fold. So I tried to include sample weights and ended up with the following error.
'''
with SMOTE
'''
import time
import numpy as np
import scipy.io as sio
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import f1_score, classification_report, accuracy_score, precision_score, recall_score, confusion_matrix
from scipy.signal import savgol_filter
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import matplotlib.pyplot as plt
import seaborn as sns
def evaluate_model_performance(y_true, y_pred, model):
    """
    Evaluate and display the performance of the trained model.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")
    print("\nClassification Report:\n", classification_report(y_true, y_pred))
    cm = confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, annot=True, fmt='d')
    plt.title("Confusion Matrix")
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()
    if hasattr(model, 'feature_importances_') and model.feature_importances_.any():
        plt.figure(figsize=(10, 8))
        xgb.plot_importance(model, max_num_features=10)
        plt.title("Feature importances")
        plt.show()
def display_class_distribution(y):
    """
    Display the distribution of classes in the label array.
    """
    unique, counts = np.unique(y, return_counts=True)
    distribution = pd.DataFrame({'Class': unique, 'Count': counts})
    print(distribution)
def create_class_distribution_df(y):
    unique, counts = np.unique(y, return_counts=True)
    distribution_df = pd.DataFrame({'Class': unique, 'Count': counts})
    distribution_df['Percentage'] = distribution_df['Count'] / distribution_df['Count'].sum() * 100
    # Filter out classes with zero counts
    distribution_df = distribution_df[distribution_df['Count'] > 0]
    return distribution_df
# Start timing
tic = time.perf_counter()
# Data Loading and Preprocessing
print("Loading data...")
Mat = sio.loadmat("/content/drive/MyDrive/Codes/Xgboost_fasa/R_FASA_Binned_spectra_all.mat")
spectraTissue = Mat['spectraTissue']
spectraTissue = savgol_filter(spectraTissue, 5, 4, 0, axis=0)
NHG = [int(x) for x in Mat['label'].T]
y = LabelEncoder().fit_transform(NHG)
patientNo = Mat['patientNo'].T
print("Size of spectraTissue:", spectraTissue.shape)
# Initial Class Distribution
print("Initial class distribution:")
display_class_distribution(y)
# Model Training and Evaluation
train_perf, test_perf, fold_details, f1_train_scores, f1_test_scores, hyperparameters, class_distribution = [], [], [], [], [], [], []
nsplits = 5
skf = StratifiedKFold(n_splits=nsplits, shuffle=True, random_state=0)
print("Starting cross-validation with SMOTE...")
for fold, (train_index, test_index) in enumerate(skf.split(spectraTissue, y), start=1):
    X_train, X_test, y_train, y_test = spectraTissue[train_index], spectraTissue[test_index], y[train_index], y[test_index]
    # Pipeline for GridSearchCV with SMOTE
    pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('classifier', xgb.XGBClassifier(objective='multi:softprob', eval_metric='mlogloss'))
    ])
    paramspace = {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [3, 5, 7],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__subsample': [0.5, 0.7, 0.9],
        'classifier__colsample_bytree': [0.5, 0.7, 0.9],
    }
    grid = GridSearchCV(pipeline, param_grid=paramspace, cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42), scoring='f1_macro', verbose=2)
    mod = grid.fit(X_train, y_train)
    # Apply SMOTE to the training data
    X_resampled, y_resampled = mod.best_estimator_.named_steps['smote'].fit_resample(X_train, y_train)
    # Calculate the class distribution after SMOTE
    resampled_distribution_df = create_class_distribution_df(y_resampled)
    print(f"\nClass distribution after SMOTE in fold {fold}:")
    print(resampled_distribution_df)
    print("Unique classes after SMOTE:", np.unique(y_resampled))
    # Predict and evaluate
    preds_train = mod.predict(X_train)
    preds_test = mod.predict(X_test)
    train_f1, test_f1 = f1_score(y_train, preds_train, average='macro'), f1_score(y_test, preds_test, average='macro')
    f1_train_scores.append(train_f1)
    f1_test_scores.append(test_f1)
    train_perf.append(train_f1)
    test_perf.append(test_f1)
    hyperparameters.append(mod.best_params_)
    # Store the post-SMOTE counts (bincount of y_train here would record the pre-SMOTE distribution)
    class_distribution.append(np.bincount(y_resampled))
    # Store fold details
    fold_details.append({
        'Fold': fold,
        'Train Size': len(X_train),
        'Test Size': len(X_test),
        'Best Params': mod.best_params_,
        'Train F1 Score': train_f1,
        'Test F1 Score': test_f1
    })
    # Output performance evaluation for the fold (pass the classifier itself so feature importances can be plotted)
    evaluate_model_performance(y_test, preds_test, mod.best_estimator_.named_steps['classifier'])
# Calculate the average class distribution after SMOTE
print("\nAverage class distribution after SMOTE across folds:")
avg_class_distribution = np.mean(class_distribution, axis=0).astype(int)
# Build the summary directly: create_class_distribution_df expects labels, not per-class counts
distribution_df = pd.DataFrame({'Class': np.arange(len(avg_class_distribution)), 'Count': avg_class_distribution})
distribution_df['Percentage'] = distribution_df['Count'] / distribution_df['Count'].sum() * 100
# Display the class distribution
print(distribution_df)
# Save the class distribution to a CSV file
distribution_csv_path = "average_class_distribution_after_smote.csv"
distribution_df.to_csv(distribution_csv_path, index=False)
print(f"Class distribution after SMOTE saved to {distribution_csv_path}")
# Results Visualization
# F1 Score Across Folds Plot
plt.figure(figsize=(10, 5))
folds = list(range(1, nsplits + 1))
plt.plot(folds, f1_train_scores, label='Train F1 Score', marker='o')
plt.plot(folds, f1_test_scores, label='Test F1 Score', marker='o')
plt.title('F1 Score Across Folds')
plt.xlabel('Fold')
plt.ylabel('F1 Score')
plt.legend()
plt.show()
# Save the overall performance and fold details
df_fold_details = pd.DataFrame(fold_details)
df_performance = pd.DataFrame({
'Fold': folds,
'Train Performance': train_perf,
'Test Performance': test_perf,
'Hyperparameters': [str(params) for params in hyperparameters]
})
df_fold_details.to_csv("R_Fasa_XGB_fold_details_with_smote.csv", index=False)
df_performance.to_csv("R_Fasa_XGB_performance_summary_with_smote.csv", index=False)
print("Model training and evaluation complete.")
toc = time.perf_counter()
print(f"Execution time: {toc - tic:.2f} seconds")
'''
with SMOTE - ROC + sample weights
'''
import time
import numpy as np
import scipy.io as sio
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, label_binarize  # label_binarize is needed for the multi-class ROC curves below
import xgboost as xgb
from sklearn.metrics import f1_score, classification_report, accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, auc
from scipy.signal import savgol_filter
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight
# Function to evaluate model performance
def evaluate_model_performance(y_true, y_pred, y_proba, model):
    """
    Evaluate and display the performance of the trained model.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
    # Plot confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title("Confusion Matrix")
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()
    # Plot feature importances if available
    if hasattr(model, 'feature_importances_') and model.feature_importances_.any():
        plt.figure(figsize=(10, 8))
        xgb.plot_importance(model, max_num_features=10)
        plt.title("Feature importances")
        plt.show()
    # ROC-AUC calculations and plotting
    # Binarize the labels first: roc_curve on the raw multi-class labels would fail
    n_classes = y_proba.shape[1]
    y_true_bin = label_binarize(y_true, classes=np.arange(n_classes))
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_true_bin.ravel(), y_proba.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    # Plot the ROC curve for the micro-average
    plt.figure(figsize=(10, 8))
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]))
    # Plot ROC curve for each class
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_true_bin[:, i], y_proba[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
        plt.plot(fpr[i], tpr[i],
                 label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Extension of Receiver operating characteristic to multi-class')
    plt.legend(loc="lower right")
    plt.show()
# Function to display class distribution
def display_class_distribution(y):
    """
    Display the distribution of classes in the label array.
    """
    unique, counts = np.unique(y, return_counts=True)
    distribution = pd.DataFrame({'Class': unique, 'Count': counts})
    distribution['Percentage'] = distribution['Count'] / sum(counts) * 100
    print(distribution)
# Start timing
tic = time.perf_counter()
# Data Loading and Preprocessing
print("Loading data...")
Mat = sio.loadmat("/content/drive/MyDrive/Codes/Xgboost_fasa/R_FASA_Binned_spectra_all.mat")
spectraTissue = Mat['spectraTissue']
spectraTissue = savgol_filter(spectraTissue, 5, 4, 0, axis=0)
NHG = [int(x) for x in Mat['label'].T]
y = LabelEncoder().fit_transform(NHG)
print("Size of spectraTissue:", spectraTissue.shape)
# Initial Class Distribution
print("Initial class distribution:")
display_class_distribution(y)
# Model Training and Evaluation
nsplits = 5
skf = StratifiedKFold(n_splits=nsplits, shuffle=True, random_state=0)
# Store performance metrics for each fold
f1_train_scores = []
f1_test_scores = []
fold_details = []
print("Starting cross-validation with SMOTE...")
for fold, (train_index, test_index) in enumerate(skf.split(spectraTissue, y), start=1):
    X_train, X_test = spectraTissue[train_index], spectraTissue[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Compute sample weights for the current fold
    sample_weight = compute_sample_weight(class_weight='balanced', y=y_train)
    # Create an XGBClassifier instance
    xgb_clf = xgb.XGBClassifier(objective='multi:softprob', eval_metric='mlogloss', use_label_encoder=False)
    # Create a pipeline with SMOTE and the XGBoost classifier
    pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('classifier', xgb_clf)
    ])
    # Define the parameter space for GridSearchCV
    paramspace = {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [3, 5, 7],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__subsample': [0.5, 0.7, 0.9],
        'classifier__colsample_bytree': [0.5, 0.7, 0.9],
        # 'classifier__scale_pos_weight' is not included here since weights are passed in the fit method
    }
    # Configure GridSearchCV
    grid = GridSearchCV(
        pipeline,
        param_grid=paramspace,
        cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
        scoring='f1_macro',
        verbose=2
    )
    # Fit the GridSearchCV, routing the sample weights to the classifier step -- this is the line that fails
    grid.fit(X_train, y_train, classifier__sample_weight=sample_weight)
    print(X_train.shape, y_train.shape, sample_weight.shape)
    # Get the best estimator
    best_pipeline = grid.best_estimator_
    # Make predictions
    preds_train = best_pipeline.predict(X_train)
    preds_test = best_pipeline.predict(X_test)
    preds_proba_test = best_pipeline.predict_proba(X_test)
    # Evaluate and store metrics for each fold
    f1_train = f1_score(y_train, preds_train, average='macro')
    f1_test = f1_score(y_test, preds_test, average='macro')
    f1_train_scores.append(f1_train)
    f1_test_scores.append(f1_test)
    fold_details.append({
        'Fold': fold,
        'Train F1 Score': f1_train,
        'Test F1 Score': f1_test,
        'Best Params': grid.best_params_
    })
    # Evaluate model performance with detailed report and visualizations
    evaluate_model_performance(y_test, preds_test, preds_proba_test, best_pipeline.named_steps['classifier'])
# Calculate and display the average training-fold class distribution (before SMOTE) across folds
print("\nAverage class distribution across training folds (before SMOTE):")
avg_class_distribution = np.mean([np.bincount(y[train_index]) for train_index, _ in skf.split(spectraTissue, y)], axis=0).astype(int)
distribution_df = pd.DataFrame({'Class': np.arange(len(avg_class_distribution)), 'Count': avg_class_distribution})
distribution_df['Percentage'] = distribution_df['Count'] / distribution_df['Count'].sum() * 100
print(distribution_df)
# Results Visualization
# Plot F1 Score Across Folds
plt.figure(figsize=(10, 5))
folds = list(range(1, nsplits + 1))
plt.plot(folds, f1_train_scores, label='Train F1 Score', marker='o')
plt.plot(folds, f1_test_scores, label='Test F1 Score', marker='o')
plt.title('F1 Score Across Folds')
plt.xlabel('Fold')
plt.ylabel('F1 Score')
plt.legend()
plt.show()
# End timing and print execution time
toc = time.perf_counter()
print(f"\nModel training and evaluation complete. Execution time: {toc - tic:.2f} seconds")
# Save the fold details to a CSV file for later analysis
df_fold_details = pd.DataFrame(fold_details)
df_fold_details.to_csv("/content/drive/MyDrive/Codes/Xgboost_fasa/fold_details.csv", index=False)
Error:

ValueError                                Traceback (most recent call last)
<ipython-input-10-5ae7e305ec5b> in <cell line: 120>()
160
161 # Fit the GridSearchCV
--> 162 grid.fit(X_train, y_train, classifier__sample_weight=sample_weight)
163 print(X_train.shape, y_train.shape, sample_weight.shape)
164
3 frames
/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py in _warn_or_raise_about_fit_failures(results, error_score)
365 f"Below are more details about the failures:\n{fit_errors_summary}"
366 )
--> 367 raise ValueError(all_fits_failed_message)
368
369 else:
ValueError:
All the 729 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.
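As the message itself suggests, the quickest way to see the real failure is to re-run the fit with error_score='raise'. A minimal debugging sketch, reusing the pipeline, paramspace, X_train, y_train, and sample_weight already defined in the loop above:
# Debugging sketch: identical grid search, but error_score='raise' makes the
# first failing fit re-raise its actual exception instead of the summary above.
debug_grid = GridSearchCV(
    pipeline,
    param_grid=paramspace,
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
    scoring='f1_macro',
    error_score='raise',  # propagate the underlying exception
)
debug_grid.fit(X_train, y_train, classifier__sample_weight=sample_weight)
In this setup the underlying exception is almost certainly a length mismatch: SMOTE enlarges the training fold inside the pipeline, but the imblearn pipeline does not resample fit parameters, so the classifier receives the original, now too-short sample_weight array.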
(Class distribution: see attached image.)
Dataset: FTIR spectral data loaded from a .mat file. I have tried SMOTE, BorderlineSMOTE, and SVMSMOTE + EasyEnsemble, but none of them worked. The F1 score is very low, and I suspect that is because class 0 is not weighted properly in the XGBoost algorithm.
Any advice or modifications would be highly appreciated.
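One way to combine SMOTE with balanced weights despite the length mismatch above is to compute the weights after resampling, inside the classifier's own fit. A minimal sketch of that idea; WeightedXGBClassifier is a hypothetical helper, not part of xgboost:
# Hedged sketch: a hypothetical wrapper that derives balanced sample weights
# from whatever (possibly SMOTE-resampled) data it is finally fitted on, so the
# weight vector always matches the number of rows the classifier sees.
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.utils.class_weight import compute_sample_weight

class WeightedXGBClassifier(xgb.XGBClassifier):
    def fit(self, X, y, **kwargs):
        # Weights computed here reflect the post-SMOTE class counts
        kwargs['sample_weight'] = compute_sample_weight(class_weight='balanced', y=y)
        return super().fit(X, y, **kwargs)

pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('classifier', WeightedXGBClassifier(objective='multi:softprob', eval_metric='mlogloss'))
])
# grid.fit(X_train, y_train) then needs no classifier__sample_weight fit parameter.
Note, though, that once SMOTE has fully balanced the classes, 'balanced' weights are close to uniform, so SMOTE and weighting largely duplicate each other. The simpler alternative is to drop SMOTE from the pipeline entirely and pass the sample weights alone; the lengths then match and the original fit call works.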
Using and even tuning the scale_pos_weight parameter can be very effective for imbalanced classification problems, often more so than resampling methods. You can even use SMOTE and add an additional bias via scale_pos_weight, although tuning may be required to find the right balance that offers a benefit over using scale_pos_weight alone, if at all.
This example may help in tuning scale_pos_weight: https://xgboosting.com/xgboost-tune-scale_pos_weight-parameter/
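For reference, a minimal sketch of what that tuning looks like. scale_pos_weight applies to binary objectives (for the multi-class problem above, per-class sample weights are the usual analogue), and the data here is synthetic via make_classification purely for illustration:
# Sketch: tune scale_pos_weight around the n_negative / n_positive heuristic
# on a synthetic imbalanced binary problem.
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, StratifiedKFold

X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)
ratio = (y == 0).sum() / (y == 1).sum()  # heuristic starting point
param_grid = {'scale_pos_weight': [1, ratio / 2, ratio, 2 * ratio]}

grid = GridSearchCV(
    xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss'),
    param_grid=param_grid,
    scoring='f1',
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=0),
)
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)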