I am trying to calculate optimism-adjusted AUCs via a double for-loop over n = n_eucl_sil ... n_gow_gap
and over three clustering algorithms. This should result in three print statements (see end). I am running into two problems: first, an "index out of bounds" error for the KMedoids algorithm; second, a print statement for every bootstrap iteration instead of one (averaged) print per algorithm, which is possibly due to incorrect code indentation.
algorithms = {
    'KMedoids': KMedoids,
    'AgglomerativeClustering': AgglomerativeClustering,
    'SpectralClustering': SpectralClustering
}
# Iterate through clustering algorithms and calculate AUC Apparent
for algorithm, labels in cluster_labels.items():
    # Create DataFrame for logistic regression with cluster labels
    df_orig = pd.DataFrame({"y_orig": np.ravel(y_dich), "labels_orig": np.ravel(labels)})
    # Fit logistic regression model
    model = sm.formula.glm(formula="y_orig ~ labels_orig", family=sm.families.Binomial(), data=df_orig).fit()
    # Calculate AUC Apparent
    y_pred_apparent = model.predict(df_orig['labels_orig'])
    auc_apparent = roc_auc_score(df_orig['y_orig'], y_pred_apparent)
    # Append results to auc_apparents DataFrame
    auc_apparents = auc_apparents.append(
        {
            'method': algorithm,
            'auc_apparent': auc_apparent
        }, ignore_index=True)
# Display AUC Apparent values
print(auc_apparents)
print()
# Iterate through clustering algorithms and perform bootstrap iterations
for algorithm_name, algorithm in algorithms.items():
    optimisms_eucl_sil, optimisms_eucl_gap, optimisms_gow_sil, optimisms_gow_gap = [], [], [], []
    ls_orig_eucl_sil, ls_orig_eucl_gap, ls_orig_gow_sil, ls_orig_gow_gap = [pd.DataFrame() for _ in range(4)]
    # Perform bootstrap iterations
    for i in range(n_bootstraps):
        sample = resample(orig_data, replace=True, n_samples=4509)
        sample_gap = resample(orig_data_gap, replace=True, n_samples=4509)
        y_boot = sample['y_orig']
        n_eucl_sil = best_silhouette_euclidean(algorithm, sample.iloc[:, 1:], range_n_clusters)[0]
        n_eucl_gap = GapStatEucl.fit_predict(algorithm, K, sample_gap.iloc[:, 1:])
        n_gow_sil = best_silhouette_gower(algorithm, gower.gower_matrix(sample.iloc[:, 1:]), range_n_clusters)[0]
        n_gow_gap = GapStatGow.fit_predict(algorithm, K, pd.DataFrame(sample_gap.iloc[:, 1:]))
        # (2) AUC Bootstrap
        try:
            if algorithm_name == 'KMedoids':
                clusterer_eucl_sil = KMedoids(n_clusters=n_eucl_sil).fit(sample.iloc[:, 1:])
                clusterer_eucl_gap = KMedoids(n_clusters=n_eucl_gap).fit(sample.iloc[:, 1:])
                clusterer_gow_sil = KMedoids(n_clusters=n_gow_sil, metric='precomputed').fit(gower.gower_matrix(sample.iloc[:, 1:]))
                clusterer_gow_gap = KMedoids(n_clusters=n_gow_gap, metric='precomputed').fit(gower.gower_matrix(sample.iloc[:, 1:]))
                l_orig_eucl_sil = pd.DataFrame(clusterer_eucl_sil.predict(orig_data.iloc[:, 1:]))
                l_orig_eucl_gap = pd.DataFrame(clusterer_eucl_gap.predict(orig_data.iloc[:, 1:]))
                l_orig_gow_sil = pd.DataFrame(clusterer_gow_sil.predict(orig_data.iloc[:, 1:]))
                l_orig_gow_gap = pd.DataFrame(clusterer_gow_gap.predict(orig_data.iloc[:, 1:]))
            elif algorithm_name == 'AgglomerativeClustering':
                clusterer_eucl_sil = AgglomerativeClustering(n_clusters=n_eucl_sil, linkage='average').fit(sample.iloc[:, 1:])
                clusterer_eucl_gap = AgglomerativeClustering(n_clusters=n_eucl_gap, linkage='average').fit(sample.iloc[:, 1:])
                clusterer_gow_sil = AgglomerativeClustering(n_clusters=n_gow_sil, metric='precomputed', linkage='average').fit(gower.gower_matrix(sample.iloc[:, 1:]))
                clusterer_gow_gap = AgglomerativeClustering(n_clusters=n_gow_gap, metric='precomputed', linkage='average').fit(gower.gower_matrix(sample.iloc[:, 1:]))
                l_orig_eucl_sil = pd.DataFrame(clusterer_eucl_sil.fit_predict(orig_data.iloc[:, 1:]))
                l_orig_eucl_gap = pd.DataFrame(clusterer_eucl_gap.fit_predict(orig_data.iloc[:, 1:]))
                l_orig_gow_sil = pd.DataFrame(clusterer_gow_sil.fit_predict(gower_matrix))
                l_orig_gow_gap = pd.DataFrame(clusterer_gow_gap.fit_predict(gower_matrix))
            elif algorithm_name == 'SpectralClustering':
                clusterer_eucl_sil = SpectralClustering(n_clusters=n_eucl_sil, affinity='nearest_neighbors').fit(sample.iloc[:, 1:])
                clusterer_eucl_gap = SpectralClustering(n_clusters=n_eucl_gap, affinity='nearest_neighbors').fit(sample.iloc[:, 1:])
                clusterer_gow_sil = SpectralClustering(n_clusters=n_gow_sil, affinity='precomputed').fit(1 - gower.gower_matrix(sample.iloc[:, 1:]))
                clusterer_gow_gap = SpectralClustering(n_clusters=n_gow_gap, affinity='precomputed').fit(1 - gower.gower_matrix(sample.iloc[:, 1:]))
                l_orig_eucl_sil = pd.DataFrame(clusterer_eucl_sil.fit_predict(orig_data.iloc[:, 1:]))
                l_orig_eucl_gap = pd.DataFrame(clusterer_eucl_gap.fit_predict(orig_data.iloc[:, 1:]))
                l_orig_gow_sil = pd.DataFrame(clusterer_gow_sil.fit_predict(gower.gower_matrix(sample.iloc[:, 1:])))
                l_orig_gow_gap = pd.DataFrame(clusterer_gow_gap.fit_predict(gower.gower_matrix(sample.iloc[:, 1:])))
            l_boot_eucl_sil = pd.DataFrame(clusterer_eucl_sil.labels_)
            l_boot_eucl_gap = pd.DataFrame(clusterer_eucl_gap.labels_)
            l_boot_gow_sil = pd.DataFrame(clusterer_gow_sil.labels_)
            l_boot_gow_gap = pd.DataFrame(clusterer_gow_gap.labels_)
            df_boot_eucl_sil = pd.DataFrame({"y_boot": np.ravel(y_boot), "l_boot_eucl_sil": np.ravel(l_boot_eucl_sil)})
            df_boot_eucl_gap = pd.DataFrame({"y_boot": np.ravel(y_boot), "l_boot_eucl_gap": np.ravel(l_boot_eucl_gap)})
            df_boot_gow_sil = pd.DataFrame({"y_boot": np.ravel(y_boot), "l_boot_gow_sil": np.ravel(l_boot_gow_sil)})
            df_boot_gow_gap = pd.DataFrame({"y_boot": np.ravel(y_boot), "l_boot_gow_gap": np.ravel(l_boot_gow_gap)})
            m_eucl_sil = sm.formula.glm(formula="y_boot ~ l_boot_eucl_sil", family=sm.families.Binomial(), data=df_boot_eucl_sil).fit()
            m_eucl_gap = sm.formula.glm(formula="y_boot ~ l_boot_eucl_gap", family=sm.families.Binomial(), data=df_boot_eucl_gap).fit()
            m_gow_sil = sm.formula.glm(formula="y_boot ~ l_boot_gow_sil", family=sm.families.Binomial(), data=df_boot_gow_sil).fit()
            m_gow_gap = sm.formula.glm(formula="y_boot ~ l_boot_gow_gap", family=sm.families.Binomial(), data=df_boot_gow_gap).fit()
            # Calculate AUC Bootstrap
            models = [m_eucl_sil, m_eucl_gap, m_gow_sil, m_gow_gap]
            label_dfs = [l_boot_eucl_sil, l_boot_eucl_gap, l_boot_gow_sil, l_boot_gow_gap]
            auc_scores = []
            # Iterate over models and label dataframes
            for model, label_df in zip(models, label_dfs):
                # Predictions
                y_pred_boot = model.predict(label_df)
                # AUC score
                auc_boot = roc_auc_score(y_boot, y_pred_boot)
                # Append to the list of AUC scores
                auc_scores.append(auc_boot)
            # Separate the AUC scores for each algorithm
            auc_boot_eucl_sil, auc_boot_eucl_gap, auc_boot_gow_sil, auc_boot_gow_gap = [auc_scores[i] for i in range(4)]
            # Append to the respective lists
            aucs_boot_eucl_sil.append(auc_boot_eucl_sil)
            aucs_boot_eucl_gap.append(auc_boot_eucl_gap)
            aucs_boot_gow_sil.append(auc_boot_gow_sil)
            aucs_boot_gow_gap.append(auc_boot_gow_gap)
            # Calculate AUC Original
            y_pred_orig_eucl_sil = m_eucl_sil.predict(pd.DataFrame(df_orig['labels_orig']))
            y_pred_orig_eucl_gap = m_eucl_gap.predict(pd.DataFrame(df_orig['labels_orig']))
            y_pred_orig_gow_sil = m_gow_sil.predict(pd.DataFrame(df_orig['labels_orig']))
            y_pred_orig_gow_gap = m_gow_gap.predict(pd.DataFrame(df_orig['labels_orig']))
            auc_orig_eucl_sil = roc_auc_score(df_orig['y_orig'], y_pred_orig_eucl_sil)
            auc_orig_eucl_gap = roc_auc_score(df_orig['y_orig'], y_pred_orig_eucl_gap)
            auc_orig_gow_sil = roc_auc_score(df_orig['y_orig'], y_pred_orig_gow_sil)
            auc_orig_gow_gap = roc_auc_score(df_orig['y_orig'], y_pred_orig_gow_gap)
            # Calculate optimism and append results
            optimism_eucl_sil = auc_boot_eucl_sil - auc_orig_eucl_sil
            optimism_eucl_gap = auc_boot_eucl_gap - auc_orig_eucl_gap
            optimism_gow_sil = auc_boot_gow_sil - auc_orig_gow_sil
            optimism_gow_gap = auc_boot_gow_gap - auc_orig_gow_gap
            optimisms_eucl_sil.append(optimism_eucl_sil)
            optimisms_eucl_gap.append(optimism_eucl_gap)
            optimisms_gow_sil.append(optimism_gow_sil)
            optimisms_gow_gap.append(optimism_gow_gap)
            ls_orig_eucl_sil = pd.concat([ls_orig_eucl_sil, l_orig_eucl_sil.reset_index(drop=True)], axis=1)
            ls_orig_eucl_gap = pd.concat([ls_orig_eucl_gap, l_orig_eucl_gap.reset_index(drop=True)], axis=1)
            ls_orig_gow_sil = pd.concat([ls_orig_gow_sil, l_orig_gow_sil.reset_index(drop=True)], axis=1)
            ls_orig_gow_gap = pd.concat([ls_orig_gow_gap, l_orig_gow_gap.reset_index(drop=True)], axis=1)
            # Calculate average optimism
            optimism_eucl_sil_avg = np.sum(optimisms_eucl_sil) / n_bootstraps
            optimism_eucl_gap_avg = np.sum(optimisms_eucl_gap) / n_bootstraps
            optimism_gow_sil_avg = np.sum(optimisms_gow_sil) / n_bootstraps
            optimism_gow_gap_avg = np.sum(optimisms_gow_gap) / n_bootstraps
            std_auc_boot_eucl_sil = np.std(aucs_boot_eucl_sil)
            std_auc_boot_eucl_gap = np.std(aucs_boot_eucl_gap)
            std_auc_boot_gow_sil = np.std(aucs_boot_gow_sil)
            std_auc_boot_gow_gap = np.std(aucs_boot_gow_gap)
        except Exception as e:
            print("An error occurred:", e)
            continue
print(f"Algorithm: {algorithm_name}\n"
f"Optimism Metrics:\n"
f" - Euclidean Silhouette: {optimism_eucl_sil_avg}\n"
f" - Euclidean Gap: {optimism_eucl_gap_avg}\n"
f" - Gower Silhouette: {optimism_gow_sil_avg}\n"
f" - Gower Gap: {optimism_gow_gap_avg}\n"
f"Standard Deviation of Bootstrap AUCs:\n"
f" - Euclidean Silhouette: {std_auc_boot_eucl_sil}\n"
f" - Euclidean Gap: {std_auc_boot_eucl_gap}\n"
f" - Gower Silhouette: {std_auc_boot_eucl_gap}\n"
f" - Gower Gap: {std_auc_boot_gow_gap}\n")
How can I fix this loop so that I get one (averaged) print statement per algorithm, with the print containing the optimism-adjusted AUC estimates in accordance with the code procedure (the four repeated code lines, one for each n)?
The immediate fix is to de-indent the block that computes the averages and the final print so that it runs once per algorithm (inside the algorithm loop, but outside the bootstrap loop) rather than once per bootstrap iteration; a minimal structural sketch is shown below. Beyond that, consider refactoring for DRY-er code using functions, conditional expressions, and data structures such as tuples and dictionaries, as in the Functions and Iterations sections further down.
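A minimal structural sketch of the de-indented loop, reusing the question's variable names. The per-bootstrap work is elided into comments, so this is an outline rather than runnable code:

for algorithm_name, algorithm in algorithms.items():
    optimisms_eucl_sil, optimisms_eucl_gap, optimisms_gow_sil, optimisms_gow_gap = [], [], [], []
    for i in range(n_bootstraps):
        try:
            # ... resample, choose n per method, cluster, fit the four GLMs,
            # and compute auc_boot_* / auc_orig_* exactly as in the question ...
            optimisms_eucl_sil.append(auc_boot_eucl_sil - auc_orig_eucl_sil)
            optimisms_eucl_gap.append(auc_boot_eucl_gap - auc_orig_eucl_gap)
            optimisms_gow_sil.append(auc_boot_gow_sil - auc_orig_gow_sil)
            optimisms_gow_gap.append(auc_boot_gow_gap - auc_orig_gow_gap)
        except Exception as e:
            print("An error occurred:", e)
            continue
    # De-indented one level: runs once per algorithm, after all bootstrap iterations
    optimism_eucl_sil_avg = np.sum(optimisms_eucl_sil) / n_bootstraps
    optimism_eucl_gap_avg = np.sum(optimisms_eucl_gap) / n_bootstraps
    optimism_gow_sil_avg = np.sum(optimisms_gow_sil) / n_bootstraps
    optimism_gow_gap_avg = np.sum(optimisms_gow_gap) / n_bootstraps
    print(f"Algorithm: {algorithm_name}\n"
          f" - Euclidean Silhouette optimism: {optimism_eucl_sil_avg}\n"
          f" - Euclidean Gap optimism: {optimism_eucl_gap_avg}\n"
          f" - Gower Silhouette optimism: {optimism_gow_sil_avg}\n"
          f" - Gower Gap optimism: {optimism_gow_gap_avg}")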
Functions
(using conditional logic/assignment)
def retrieve_n(method, algorithm, smpl, smpl_gap, range_n_clusters):
    # DICT OF TUPLES CONTAINING FUNCTION AND ARGS
    d = {
        "eucl_sil": (best_silhouette_euclidean, (algorithm, smpl, range_n_clusters)),
        "eucl_gap": (GapStatEucl.fit_predict, (algorithm, K, smpl_gap)),
        "gow_sil": (best_silhouette_gower, (algorithm, gower.gower_matrix(smpl), range_n_clusters)),
        "gow_gap": (GapStatGow.fit_predict, (algorithm, K, pd.DataFrame(smpl_gap)))
    }
    return d[method]
def auc_bootstrap(algo_name, method, n, sample_iloc, orig_iloc):
    if algo_name == 'KMedoids':
        clusterer = (
            KMedoids(n_clusters=n).fit(sample_iloc)
            if method.startswith("eucl")
            else KMedoids(n_clusters=n, metric='precomputed').fit(gower.gower_matrix(sample_iloc))
        )
        l_orig = pd.DataFrame(clusterer.predict(orig_iloc))
    elif algo_name == 'AgglomerativeClustering':
        clusterer = (
            AgglomerativeClustering(n_clusters=n, linkage='average').fit(sample_iloc)
            if method.startswith("eucl")
            else AgglomerativeClustering(n_clusters=n, metric='precomputed', linkage='average').fit(gower.gower_matrix(sample_iloc))
        )
        l_orig = (
            pd.DataFrame(clusterer.fit_predict(orig_iloc))
            if method.startswith("eucl")
            else pd.DataFrame(clusterer.fit_predict(gower_matrix))
        )
    elif algo_name == 'SpectralClustering':
        clusterer = (
            SpectralClustering(n_clusters=n, affinity='nearest_neighbors').fit(sample_iloc)
            if method.startswith("eucl")
            else SpectralClustering(n_clusters=n, affinity='precomputed').fit(1 - gower.gower_matrix(sample_iloc))
        )
        l_orig = (
            pd.DataFrame(clusterer.fit_predict(orig_iloc))
            if method.startswith("eucl")
            else pd.DataFrame(clusterer.fit_predict(gower.gower_matrix(sample_iloc)))
        )
    return clusterer, l_orig
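One caveat with the dict-of-tuples dispatch in retrieve_n: the argument tuples are built eagerly, so gower.gower_matrix(smpl) is computed on every call even when a Euclidean method is requested. If that becomes a bottleneck, a possible variant (an untested sketch using the same names as above) stores zero-argument lambdas and folds the [0] indexing for the silhouette methods into the helper:

def retrieve_n(method, algorithm, smpl, smpl_gap, range_n_clusters):
    # Each entry defers all work until the selected lambda is called
    d = {
        "eucl_sil": lambda: best_silhouette_euclidean(algorithm, smpl, range_n_clusters)[0],
        "eucl_gap": lambda: GapStatEucl.fit_predict(algorithm, K, smpl_gap),
        "gow_sil": lambda: best_silhouette_gower(algorithm, gower.gower_matrix(smpl), range_n_clusters)[0],
        "gow_gap": lambda: GapStatGow.fit_predict(algorithm, K, pd.DataFrame(smpl_gap))
    }
    return d[method]()

With this variant the call site in the loop below simplifies to n = retrieve_n(method, algo, sample.iloc[:, 1:], sample_gap.iloc[:, 1:], range_n_clusters).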
Iterations
(adding a nested loop over the four methods)
# Iterate through clustering algorithms and perform bootstrap iterations
algo_aucs_boots, algo_optimisms, algo_ls_orig_dfs = {}, {}, {}
for algo_name, algo in algorithms.items():
    method_aucs_boots, method_optimisms, method_ls_orig_dfs = [
        {method: [] for method in ["eucl_sil", "eucl_gap", "gow_sil", "gow_gap"]}
        for _ in range(3)
    ]
    # Perform bootstrap iterations
    for i in range(n_bootstraps):
        sample = resample(orig_data, replace=True, n_samples=4509)
        sample_gap = resample(orig_data_gap, replace=True, n_samples=4509)
        y_boot = sample['y_orig']
for method in ["eucl_sil", "eucl_gap", "gow_sil", "gow_gap"]:
func, args = retrieve_n(method, algo, sample.iloc[:, 1:], range_n_clusters)
n = func(*args)[0] if method.endswith("sil") else func(*args)
clusterer, l_orig = auc_bootstrap(algo_name, method, n, sample.iloc[:, 1:], orig_data.iloc[:, 1:])
# (2) AUC Bootstrap
try:
l_boot = pd.DataFrame(clusterer.labels_)
df_boot = pd.DataFrame({"y_boot": np.ravel(y_boot), "l_boot": np.ravel(l_boot)})
m_ = sm.formula.glm(formula="y_boot ~ l_boot", family=sm.families.Binomial(), data=df_boot).fit()
# Calculate AUC Boot
y_pred_boot = m_.predict(l_boot)
auc_boot = roc_auc_score(y_boot, y_pred_boot)
# Calculate AUC Original
y_pred_orig = m_.predict(pd.DataFrame(df_orig['labels_orig']))
auc_orig = roc_auc_score(df_orig['y_orig'], y_pred_orig)
# Calculate optimism and append results
optimism = auc_boot - auc_orig
# Append to the respective lists
method_aucs_boots[method].append(auc_boot)
method_optimisms[method].append(optimism)
method_ls_orig_dfs[method].append(l_orig.reset_index(drop=True))
except Exception as e:
print("An error occurred:", e)
continue
    # Aggregate and compile
    optimism_avg = {k: np.sum(d) / n_bootstraps for k, d in method_optimisms.items()}
    std_auc_boot = {k: np.std(d) for k, d in method_aucs_boots.items()}
    method_ls_orig_dfs = {k: pd.concat(dfs, axis=1) for k, dfs in method_ls_orig_dfs.items()}
    # Assign to the algo-level dicts
    algo_aucs_boots[algo_name] = method_aucs_boots
    algo_optimisms[algo_name] = method_optimisms
    algo_ls_orig_dfs[algo_name] = method_ls_orig_dfs
    print(
        f"Algorithm: {algo_name}\n"
        f"Optimism Metrics:\n"
        f" - Euclidean Silhouette: {optimism_avg['eucl_sil']}\n"
        f" - Euclidean Gap: {optimism_avg['eucl_gap']}\n"
        f" - Gower Silhouette: {optimism_avg['gow_sil']}\n"
        f" - Gower Gap: {optimism_avg['gow_gap']}\n"
        f"Standard Deviation of Bootstrap AUCs:\n"
        f" - Euclidean Silhouette: {std_auc_boot['eucl_sil']}\n"
        f" - Euclidean Gap: {std_auc_boot['eucl_gap']}\n"
        f" - Gower Silhouette: {std_auc_boot['gow_sil']}\n"
        f" - Gower Gap: {std_auc_boot['gow_gap']}\n"
    )
NOTE: Above has not been tested. Please take the time to review and adjust as needed.
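One further point: neither version above reports the optimism-adjusted AUC itself. Under the usual bootstrap optimism correction, the adjusted estimate is the apparent AUC minus the average optimism. A sketch, assuming the 'method' column of auc_apparents from the first loop uses the same algorithm names as the algorithms dict, and that algo_optimisms is keyed by algorithm name as in the refactored loop:

for algo_name in algorithms:
    # Apparent AUC for this algorithm from the first loop
    auc_apparent = auc_apparents.loc[auc_apparents['method'] == algo_name, 'auc_apparent'].iloc[0]
    for method, optimisms in algo_optimisms[algo_name].items():
        # Optimism-adjusted AUC = apparent AUC - average bootstrap optimism
        auc_adjusted = auc_apparent - np.sum(optimisms) / n_bootstraps
        print(f"{algo_name} / {method}: optimism-adjusted AUC = {auc_adjusted:.3f}")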