I’m trying to evaluate classification models on a highly imbalanced fraud dataset using the Brier Skill Score (BSS) as the evaluation metric.
The dataset has ~2133 rows and the target Fraud_Flag is imbalanced:
Counter({0: 2067, 1: 66})
I implemented the Brier Skill Score (BSS = 1 - BS_model / BS_ref, where the reference forecast always predicts the positive-class prior) like this:
def brier_skill_score(y_true, y_prob):
    """
    Compute the Brier Skill Score (BSS).
    - 0.0  = no skill (same as baseline)
    - >0.0 = better than baseline
    - <0.0 = worse than baseline
    """
    pos_prob = np.count_nonzero(y_true) / len(y_true)  # positive class proportion
    ref_probs = [pos_prob for _ in range(len(y_true))]
    # baseline (always predict the prior probability)
    bs_ref = brier_score_loss(y_true, ref_probs)
    # model score
    bs_model = brier_score_loss(y_true, y_prob)
    if bs_ref == 0:  # avoid division by zero
        return 0.0
    return 1.0 - (bs_model / bs_ref)
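As a sanity check (a minimal sketch using the class counts above), the no-skill baseline should come out at BSS ≈ 0: for a constant forecast p scored against outcomes whose mean is p, the Brier score is exactly p * (1 - p).

import numpy as np
from sklearn.metrics import brier_score_loss

# Outcomes with the same imbalance as the dataset: 66 positives out of 2133
y_true = np.array([1] * 66 + [0] * 2067)
p = y_true.mean()                                    # base rate ≈ 0.031

ref = np.full(len(y_true), p)                        # always predict the prior
print(brier_score_loss(y_true, ref), p * (1 - p))    # both ≈ 0.030
print(brier_skill_score(y_true, ref))                # 0.0 (no skill)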
baseline = DummyClassifier(strategy="prior")
baseline_model = build_pipeline(X, baseline)
print("\nBaseline (DummyClassifier):")
evaluate(X, y, baseline_model)
# Logistic Regression
lr_model = build_pipeline(X, LogisticRegression(max_iter=1000))
print("\nLogistic Regression:")
evaluate(X, y, lr_model)
# Random Forest
rf_model = build_pipeline(X, RandomForestClassifier(random_state=42))
print("\nRandom Forest:")
evaluate(X, y, rf_model)
# Gradient Boosting
gb_model = build_pipeline(X, GradientBoostingClassifier(random_state=42))
print("\nGradient Boosting:")
evaluate(X, y, gb_model)
Baseline (DummyClassifier):
Mean BSS: nan (nan)
Logistic Regression:
Mean BSS: nan (nan)
Random Forest:
Mean BSS: nan (nan)
Gradient Boosting:
Mean BSS: nan (nan)
Complete code:
import pandas as pd
import numpy as np
from numpy import mean, std
from collections import Counter
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import brier_score_loss, make_scorer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# ==================================================
# 1. Brier Skill Score
# ==================================================
def brier_skill_score(y_true, y_prob):
    pos_prob = np.count_nonzero(y_true) / len(y_true)
    ref_probs = [pos_prob for _ in range(len(y_true))]
    bs_ref = brier_score_loss(y_true, ref_probs)
    bs_model = brier_score_loss(y_true, y_prob)
    if bs_ref == 0:  # avoid division by zero
        return 0.0
    return 1.0 - (bs_model / bs_ref)
# ==================================================
# 2. Model evaluation
# ==================================================
def evaluate(X, y, model, n_splits=10, n_repeats=3):
    cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=42)
    metric = make_scorer(brier_skill_score, needs_proba=True)
    scores = cross_val_score(model, X, y, scoring=metric, cv=cv, n_jobs=-1)
    print("Mean BSS: %.3f (%.3f)" % (mean(scores), std(scores)))
    return scores
# ==================================================
# 3. Preprocessing + model pipeline
# ==================================================
def build_pipeline(X, model=None):
    num_cols = X.select_dtypes(include=["int64", "float64"]).columns
    cat_cols = X.select_dtypes(include=["object", "category"]).columns
    num_pipeline = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler())
    ])
    cat_pipeline = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])
    column_tr = ColumnTransformer(
        transformers=[
            ("num", num_pipeline, num_cols),
            ("cat", cat_pipeline, cat_cols)
        ]
    )
    if model is None:
        model = RandomForestClassifier(random_state=42)
    pipeline = ImbPipeline(steps=[
        ("preprocessor", column_tr),
        ("model", model)
    ])
    return pipeline
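# Note: imblearn's Pipeline is a drop-in replacement for sklearn's here;
# it only matters if a resampling step (e.g. ("smote", SMOTE())) is later
# inserted before "model". Without a sampler it behaves identically.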
# ==================================================
# 4. Example usage with the credit card dataset
# ==================================================
# df = pd.read_csv("credit_card.csv")
# X = df.drop("Fraud_Flag", axis=1)
# y = LabelEncoder().fit_transform(df["Fraud_Flag"])
print(X.shape, y.shape, Counter(y))
baseline = DummyClassifier(strategy="prior")
baseline_model = build_pipeline(X, baseline)
print("\nBaseline (DummyClassifier):")
evaluate(X, y, baseline_model)
lr_model = build_pipeline(X, LogisticRegression(max_iter=1000))
print("\nLogistic Regression:")
evaluate(X, y, lr_model)
rf_model = build_pipeline(X, RandomForestClassifier(random_state=42))
print("\nRandom Forest:")
evaluate(X, y, rf_model)
gb_model = build_pipeline(X, GradientBoostingClassifier(random_state=42))
print("\nGradient Boosting:")
evaluate(X, y, gb_model)
I was getting NaN values when using the Brier Skill Score with cross-validation. The issue was in how I defined the scorer:
metric = make_scorer(brier_skill_score, needs_proba=True)
But I found that needs_proba was deprecated in scikit-learn 1.4 and removed in 1.6. On newer releases the argument is silently forwarded to the metric function as a keyword, so every CV fold raises a TypeError, and cross_val_score's default error_score=np.nan turns those failures into NaN scores.
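You can confirm this yourself: re-running the evaluation with error_score="raise" surfaces the real exception instead of silently recording NaN for each failing fold:

scores = cross_val_score(model, X, y, scoring=metric, cv=cv,
                         n_jobs=-1, error_score="raise")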
✅ The fix was to tell scikit-learn explicitly to use predict_proba via the response_method parameter:
metric = make_scorer(brier_skill_score, response_method="predict_proba")
This solved the problem and I now get valid Brier Skill Score results instead of NaN.
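For anyone who has to support both old and new scikit-learn releases, here is a small compatibility shim (a sketch on my part, assuming only that response_method was added to make_scorer in 1.4) that picks the right keyword at runtime:

import inspect
from sklearn.metrics import make_scorer

# Use response_method on scikit-learn >= 1.4, needs_proba on older releases
if "response_method" in inspect.signature(make_scorer).parameters:
    metric = make_scorer(brier_skill_score, response_method="predict_proba")
else:
    metric = make_scorer(brier_skill_score, needs_proba=True)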