I am using the random forest classifier from sklearn and I am getting decent results in everything except the confusion matrix. Here are the code and results.
The label distribution for the training and testing sets:
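A quick way to print these counts, assuming y_train and y_test are pandas Series (with NumPy arrays, numpy.unique with return_counts=True works instead):
# Print how many samples each class has in the training and testing labels
print("Training label distribution:")
print(y_train.value_counts().sort_index())
print("Testing label distribution:")
print(y_test.value_counts().sort_index())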
This is not what I was expecting, especially since the total shown is only about 1/3 of what it should be: I have 677k samples in the training dataset, but the confusion matrix only shows the counts for label 0.
The model:
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Record the starting time
start_time = time.time()
# Random Forest classifier
rf = RandomForestClassifier()
# Define the parameter grid
rf_param_grid = {'n_estimators': [45], 'criterion': ['entropy'], 'max_depth': [30]}
# Grid search
rf_cv = GridSearchCV(rf, rf_param_grid, cv=7)
rf_cv.fit(X_train, y_train)
# Record the ending time
end_time = time.time()
# Calculate the elapsed time
elapsed_time = end_time - start_time
# Print the results
print("Best Score:", rf_cv.best_score_)
print("Best Parameters:", rf_cv.best_params_)
print("Elapsed Time:", elapsed_time, "seconds")
I got good results here, over 98% for each class:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Make predictions on the training data
y_train_pred = rf_cv.predict(X_train)
# Compute accuracy
accuracy = accuracy_score(y_train, y_train_pred)
# Compute precision, recall, and F1-score for each class
precision = precision_score(y_train, y_train_pred, average=None)
recall = recall_score(y_train, y_train_pred, average=None)
f1 = f1_score(y_train, y_train_pred, average=None)
# Compute macro-averaged precision, recall, and F1-score
macro_precision = precision_score(y_train, y_train_pred, average='macro')
macro_recall = recall_score(y_train, y_train_pred, average='macro')
macro_f1 = f1_score(y_train, y_train_pred, average='macro')
# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision (Class 0, 1, 2):", precision)
print("Recall (Class 0, 1, 2):", recall)
print("F1-score (Class 0, 1, 2):", f1)
print("Macro-averaged Precision:", macro_precision)
print("Macro-averaged Recall:", macro_recall)
print("Macro-averaged F1-score:", macro_f1)
The confusion matrix, which doesn't show any labels except class 0:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Generate the confusion matrix
conf_matrix = confusion_matrix(y_train, y_train_pred)
# Define class labels
class_labels = ['Class 0', 'Class 1', 'Class 2']
# Visualize the confusion matrix with class labels
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()
It seems like the trouble is with matplotlib/seaborn. (I can't reproduce it; maybe you need to give us a reproducible example with the exact code you wrote and your dataset.)
Instead of using the plot, you can display/print the confusion matrix as a DataFrame.
import pandas as pd
from sklearn.metrics import confusion_matrix
def get_confusion_matrix_df(classifier, X, y):
    """Return the confusion matrix as a DataFrame."""
    labels = classifier.classes_
    columns_labels = pd.MultiIndex.from_product([["Predicted"], labels])
    index_labels = pd.MultiIndex.from_product([["Actual"], labels])
    prediction = classifier.predict(X)
    matrix = confusion_matrix(y, prediction, labels=labels)
    return pd.DataFrame(matrix, columns=columns_labels, index=index_labels)
get_confusion_matrix_df(rf_cv, X_train, y_train)
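Passing labels=classifier.classes_ to confusion_matrix guarantees that every class gets a row and a column even if it is never predicted, and the "Actual"/"Predicted" MultiIndex makes the orientation of the matrix unambiguous.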
Example:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
X, y = load_iris(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
get_confusion_matrix_df(model, X_test, y_test)
Result: the confusion matrix printed as a labeled DataFrame, with "Actual" rows and "Predicted" columns for each of the three iris classes.
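If you still want a plot, a minimal alternative (assuming scikit-learn 1.0 or later) is scikit-learn's own ConfusionMatrixDisplay, which takes the tick labels from the fitted estimator instead of a hand-written list:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Plot the confusion matrix; tick labels come from model.classes_
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)
plt.show()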