I’m trying to evaluate classification models on a highly imbalanced fraud dataset using the Brier Skill Score (BSS) as the evaluation metric.
The dataset has ~2133 rows and the target Fraud_Flag is imbalanced:
Counter({0: 2067, 1: 66})
I implemented the Brier Skill Score (BSS = 1 - BS_model / BS_ref, where the reference forecast always predicts the positive-class prior) like this:
def brier_skill_score(y_true, y_prob):
    """
    Compute the Brier Skill Score (BSS).
    - 0.0  = no skill (same as baseline)
    - >0.0 = better than baseline
    - <0.0 = worse than baseline
    """
    pos_prob = np.count_nonzero(y_true) / len(y_true)  # positive class proportion
    ref_probs = [pos_prob for _ in range(len(y_true))]
    # baseline (always predict the prior probability)
    bs_ref = brier_score_loss(y_true, ref_probs)
    # model score
    bs_model = brier_score_loss(y_true, y_prob)
    if bs_ref == 0:  # avoid division by zero
        return 0.0
    return 1.0 - (bs_model / bs_ref)
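As a sanity check (a minimal sketch using the class counts above), the no-skill baseline should come out at BSS ≈ 0: for a constant forecast p scored against outcomes whose mean is p, the Brier score is exactly p * (1 - p).

import numpy as np
from sklearn.metrics import brier_score_loss

# Outcomes with the same imbalance as the dataset: 66 positives out of 2133
y_true = np.array([1] * 66 + [0] * 2067)
p = y_true.mean()                                    # base rate ≈ 0.031

ref = np.full(len(y_true), p)                        # always predict the prior
print(brier_score_loss(y_true, ref), p * (1 - p))    # both ≈ 0.030
print(brier_skill_score(y_true, ref))                # 0.0 (no skill)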
baseline = DummyClassifier(strategy="prior")
baseline_model = build_pipeline(X, baseline)
print("\nBaseline (DummyClassifier):")
evaluate(X, y, baseline_model)
# Logistic Regression
lr_model = build_pipeline(X, LogisticRegression(max_iter=1000))
print("\nLogistic Regression:")
evaluate(X, y, lr_model)
# Random Forest
rf_model = build_pipeline(X, RandomForestClassifier(random_state=42))
print("\nRandom Forest:")
evaluate(X, y, rf_model)
# Gradient Boosting
gb_model = build_pipeline(X, GradientBoostingClassifier(random_state=42))
print("\nGradient Boosting:")
evaluate(X, y, gb_model)
Baseline (DummyClassifier):
Mean BSS: nan (nan)
Logistic Regression:
Mean BSS: nan (nan)
Random Forest:
Mean BSS: nan (nan)
Gradient Boosting:
Mean BSS: nan (nan)
Complete code:
import pandas as pd
import numpy as np
from numpy import mean, std
from collections import Counter
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import brier_score_loss, make_scorer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# ==================================================
# 1. Brier Skill Score
# ==================================================
def brier_skill_score(y_true, y_prob):
    pos_prob = np.count_nonzero(y_true) / len(y_true)
    ref_probs = [pos_prob for _ in range(len(y_true))]
    bs_ref = brier_score_loss(y_true, ref_probs)
    bs_model = brier_score_loss(y_true, y_prob)
    if bs_ref == 0:  # avoid division by zero
        return 0.0
    return 1.0 - (bs_model / bs_ref)
# ==================================================
# 2. Model evaluation
# ==================================================
def evaluate(X, y, model, n_splits=10, n_repeats=3):
    cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=42)
    metric = make_scorer(brier_skill_score, needs_proba=True)
    scores = cross_val_score(model, X, y, scoring=metric, cv=cv, n_jobs=-1)
    print("Mean BSS: %.3f (%.3f)" % (mean(scores), std(scores)))
    return scores
# ==================================================
# 3. Preprocessing + model pipeline
# ==================================================
def build_pipeline(X, model=None):
    num_cols = X.select_dtypes(include=["int64", "float64"]).columns
    cat_cols = X.select_dtypes(include=["object", "category"]).columns
    num_pipeline = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler())
    ])
    cat_pipeline = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])
    column_tr = ColumnTransformer(
        transformers=[
            ("num", num_pipeline, num_cols),
            ("cat", cat_pipeline, cat_cols)
        ]
    )
    if model is None:
        model = RandomForestClassifier(random_state=42)
    pipeline = ImbPipeline(steps=[
        ("preprocessor", column_tr),
        ("model", model)
    ])
    return pipeline
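# Note: imblearn's Pipeline is a drop-in replacement for sklearn's here;
# it only matters if a resampling step (e.g. ("smote", SMOTE())) is later
# inserted before "model". Without a sampler it behaves identically.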
# ==================================================
# 4. Example usage with the credit card dataset
# ==================================================
# df = pd.read_csv("credit_card.csv")
# X = df.drop("Fraud_Flag", axis=1)
# y = LabelEncoder().fit_transform(df["Fraud_Flag"])
print(X.shape, y.shape, Counter(y))
baseline = DummyClassifier(strategy="prior")
baseline_model = build_pipeline(X, baseline)
print("\nBaseline (DummyClassifier):")
evaluate(X, y, baseline_model)
lr_model = build_pipeline(X, LogisticRegression(max_iter=1000))
print("\nLogistic Regression:")
evaluate(X, y, lr_model)
rf_model = build_pipeline(X, RandomForestClassifier(random_state=42))
print("\nRandom Forest:")
evaluate(X, y, rf_model)
gb_model = build_pipeline(X, GradientBoostingClassifier(random_state=42))
print("\nGradient Boosting:")
evaluate(X, y, gb_model)
I was getting NaN values when using the Brier Skill Score with cross-validation. The issue was in how I defined the scorer:
metric = make_scorer(brier_skill_score, needs_proba=True)
But I found that needs_proba was deprecated in scikit-learn 1.4 and removed in 1.6. On newer releases the argument is silently forwarded to the metric function as a keyword, so every CV fold raises a TypeError, and cross_val_score's default error_score=np.nan turns those failures into NaN scores.
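You can confirm this yourself: re-running the evaluation with error_score="raise" surfaces the real exception instead of silently recording NaN for each failing fold:

scores = cross_val_score(model, X, y, scoring=metric, cv=cv,
                         n_jobs=-1, error_score="raise")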
✅ The fix was to tell scikit-learn explicitly to use predict_proba via the response_method parameter:
metric = make_scorer(brier_skill_score, response_method="predict_proba")
This solved the problem and I now get valid Brier Skill Score results instead of NaN.
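For anyone who has to support both old and new scikit-learn releases, here is a small compatibility shim (a sketch on my part, assuming only that response_method was added to make_scorer in 1.4) that picks the right keyword at runtime:

import inspect
from sklearn.metrics import make_scorer

# Use response_method on scikit-learn >= 1.4, needs_proba on older releases
if "response_method" in inspect.signature(make_scorer).parameters:
    metric = make_scorer(brier_skill_score, response_method="predict_proba")
else:
    metric = make_scorer(brier_skill_score, needs_proba=True)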