pythonloopsfor-loopcluster-analysis

Complicated for-loop in Python


I am trying to calculate the optimism-adjusted AUC's via a double for-loop over n = n_eucl_sil ... n_gow_gap and for three clustering algorithms. This should result in three print statements (see end). I am encountering two errors: first, "index out of bounds" for the K-medoids algorithm; second, print statements for each bootstrap iteration, instead of one (averaged) print. This is possibly due to wrong code indentation.

algorithms = {
    'KMedoids': KMedoids,
    'AgglomerativeClustering': AgglomerativeClustering,
    'SpectralClustering': SpectralClustering
}

# Iterate through clustering algorithms and calculate AUC Apparent
for algorithm, labels in cluster_labels.items():
    # Create DataFrame for logistic regression with cluster labels
    df_orig = pd.DataFrame({"y_orig": np.ravel(y_dich), "labels_orig": np.ravel(labels)})

    # Fit logistic regression model
    model = sm.formula.glm(formula="y_orig ~ labels_orig", family=sm.families.Binomial(), data=df_orig).fit()

    # Calculate AUC Apparent
    y_pred_apparent = model.predict(df_orig['labels_orig'])
    auc_apparent = roc_auc_score(df_orig['y_orig'], y_pred_apparent)

    # Append results to auc_apparents DataFrame
    auc_apparents = auc_apparents.append(
        {
            'method': algorithm,
            'auc_apparent': auc_apparent
        }, ignore_index=True)

# Display AUC Apparent values
print(auc_apparents)
print()

# Iterate through clustering algorithms and perform bootstrap iterations
for algorithm_name, algorithm in algorithms.items():

    optimisms_eucl_sil, optimisms_eucl_gap, optimisms_gow_sil, optimisms_gow_gap = [], [], [], []
    ls_orig_eucl_sil, ls_orig_eucl_gap, ls_orig_gow_sil, ls_orig_gow_gap = [pd.DataFrame() for _ in range(4)]

    # Perform bootstrap iterations
    for i in range(n_bootstraps):
        sample = resample(orig_data, replace=True, n_samples=4509)
        sample_gap = resample(orig_data_gap, replace=True, n_samples=4509)
        y_boot = sample['y_orig']

        n_eucl_sil = best_silhouette_euclidean(algorithm, sample.iloc[:, 1:], range_n_clusters)[0]
        n_eucl_gap = GapStatEucl.fit_predict(algorithm, K, sample_gap.iloc[:,1:])
        n_gow_sil = best_silhouette_gower(algorithm, gower.gower_matrix(sample.iloc[:, 1:]), range_n_clusters)[0]
        n_gow_gap = GapStatGow.fit_predict(algorithm, K, pd.DataFrame(sample_gap.iloc[:, 1:]))

        # (2) AUC Bootstrap
        try:
            if algorithm_name == 'KMedoids':        
                clusterer_eucl_sil = KMedoids(n_clusters=n_eucl_sil).fit(sample.iloc[:, 1:])
                clusterer_eucl_gap = KMedoids(n_clusters=n_eucl_gap).fit(sample.iloc[:, 1:])
                clusterer_gow_sil = KMedoids(n_clusters=n_gow_sil, metric='precomputed').fit(gower.gower_matrix(sample.iloc[:,1:]))
                clusterer_gow_gap = KMedoids(n_clusters=n_gow_gap, metric='precomputed').fit(gower.gower_matrix(sample.iloc[:,1:]))

                l_orig_eucl_sil = pd.DataFrame(clusterer_eucl_sil.predict(orig_data.iloc[:, 1:]))
                l_orig_eucl_gap = pd.DataFrame(clusterer_eucl_gap.predict(orig_data.iloc[:, 1:]))
                l_orig_gow_sil = pd.DataFrame(clusterer_gow_sil.predict(orig_data.iloc[:, 1:]))
                l_orig_gow_gap = pd.DataFrame(clusterer_gow_gap.predict(orig_data.iloc[:, 1:]))

            elif algorithm_name == 'AgglomerativeClustering':
                clusterer_eucl_sil = AgglomerativeClustering(n_clusters=n_eucl_sil,linkage='average').fit(sample.iloc[:, 1:])
                clusterer_eucl_gap = AgglomerativeClustering(n_clusters=n_eucl_gap,linkage='average').fit(sample.iloc[:, 1:])
                clusterer_gow_sil = AgglomerativeClustering(n_clusters=n_gow_sil, metric='precomputed', linkage='average').fit(gower.gower_matrix(sample.iloc[:, 1:]))
                clusterer_gow_gap = AgglomerativeClustering(n_clusters=n_gow_gap, metric='precomputed', linkage='average').fit(gower.gower_matrix(sample.iloc[:, 1:]))

                l_orig_eucl_sil = pd.DataFrame(clusterer_eucl_sil.fit_predict(orig_data.iloc[:, 1:]))
                l_orig_eucl_gap = pd.DataFrame(clusterer_eucl_gap.fit_predict(orig_data.iloc[:, 1:]))
                l_orig_gow_sil = pd.DataFrame(clusterer_gow_sil.fit_predict(gower_matrix))
                l_orig_gow_gap = pd.DataFrame(clusterer_gow_gap.fit_predict(gower_matrix))

            elif algorithm_name == 'SpectralClustering':
                clusterer_eucl_sil = SpectralClustering(n_clusters=n_eucl_sil, affinity='nearest_neighbors').fit(sample.iloc[:, 1:])
                clusterer_eucl_gap = SpectralClustering(n_clusters=n_eucl_gap, affinity='nearest_neighbors').fit(sample.iloc[:, 1:])
                clusterer_gow_sil = SpectralClustering(n_clusters=n_gow_sil, affinity='precomputed').fit(1 - gower.gower_matrix(sample.iloc[:,1:]))
                clusterer_gow_gap = SpectralClustering(n_clusters=n_gow_gap, affinity='precomputed').fit(1 - gower.gower_matrix(sample.iloc[:, 1:]))

                l_orig_eucl_sil = pd.DataFrame(clusterer_eucl_sil.fit_predict(orig_data.iloc[:, 1:]))
                l_orig_eucl_gap = pd.DataFrame(clusterer_eucl_gap.fit_predict(orig_data.iloc[:, 1:]))
                l_orig_gow_sil = pd.DataFrame(clusterer_gow_sil.fit_predict(gower.gower_matrix(sample.iloc[:,1:])))
                l_orig_gow_gap = pd.DataFrame(clusterer_gow_gap.fit_predict(gower.gower_matrix(sample.iloc[:,1:])))

            l_boot_eucl_sil = pd.DataFrame(clusterer_eucl_sil.labels_)
            l_boot_eucl_gap = pd.DataFrame(clusterer_eucl_gap.labels_)
            l_boot_gow_sil = pd.DataFrame(clusterer_gow_sil.labels_)
            l_boot_gow_gap = pd.DataFrame(clusterer_gow_gap.labels_)

            df_boot_eucl_sil = pd.DataFrame({"y_boot": np.ravel(y_boot), "l_boot_eucl_sil": np.ravel(l_boot_eucl_sil)})
            df_boot_eucl_gap = pd.DataFrame({"y_boot": np.ravel(y_boot), "l_boot_eucl_gap": np.ravel(l_boot_eucl_gap)})
            df_boot_gow_sil = pd.DataFrame({"y_boot": np.ravel(y_boot), "l_boot_gow_sil": np.ravel(l_boot_gow_sil)})
            df_boot_gow_gap = pd.DataFrame({"y_boot": np.ravel(y_boot), "l_boot_gow_gap": np.ravel(l_boot_gow_gap)})

            m_eucl_sil = sm.formula.glm(formula="y_boot ~ l_boot_eucl_sil", family=sm.families.Binomial(), data=df_boot_eucl_sil).fit()
            m_eucl_gap = sm.formula.glm(formula="y_boot ~ l_boot_eucl_gap", family=sm.families.Binomial(), data=df_boot_eucl_gap).fit()
            m_gow_sil = sm.formula.glm(formula="y_boot ~ l_boot_gow_sil", family=sm.families.Binomial(), data=df_boot_gow_sil).fit()
            m_gow_gap = sm.formula.glm(formula="y_boot ~ l_boot_gow_gap", family=sm.families.Binomial(), data=df_boot_gow_gap).fit()

            # Calculate AUC Bootstrap
            models = [m_eucl_sil, m_eucl_gap, m_gow_sil, m_gow_gap]
            label_dfs = [l_boot_eucl_sil, l_boot_eucl_gap, l_boot_gow_sil, l_boot_gow_gap]
            auc_scores = []

            # Iterate over models and label dataframes
            for model, label_df in zip(models, label_dfs):
                # Predictions
                y_pred_boot = model.predict(label_df)

                # AUC score
                auc_boot = roc_auc_score(y_boot, y_pred_boot)

                # Append to the list of AUC scores
                auc_scores.append(auc_boot)

            # Separate the AUC scores for each algorithm
            auc_boot_eucl_sil, auc_boot_eucl_gap, auc_boot_gow_sil, auc_boot_gow_gap = [auc_scores[i] for i in range(4)]

            # Append to the respective lists
            aucs_boot_eucl_sil.append(auc_boot_eucl_sil)
            aucs_boot_eucl_gap.append(auc_boot_eucl_gap)
            aucs_boot_gow_sil.append(auc_boot_gow_sil)
            aucs_boot_gow_gap.append(auc_boot_gow_gap)

            # Calculate AUC Original
            y_pred_orig_eucl_sil = m_eucl_sil.predict(pd.DataFrame(df_orig['labels_orig']))
            y_pred_orig_eucl_gap = m_eucl_gap.predict(pd.DataFrame(df_orig['labels_orig']))
            y_pred_orig_gow_sil = m_gow_sil.predict(pd.DataFrame(df_orig['labels_orig']))
            y_pred_orig_gow_gap = m_gow_gap.predict(pd.DataFrame(df_orig['labels_orig']))

            auc_orig_eucl_sil = roc_auc_score(df_orig['y_orig'], y_pred_orig_eucl_sil)
            auc_orig_eucl_gap = roc_auc_score(df_orig['y_orig'], y_pred_orig_eucl_gap)
            auc_orig_gow_sil = roc_auc_score(df_orig['y_orig'], y_pred_orig_gow_sil)
            auc_orig_gow_gap = roc_auc_score(df_orig['y_orig'], y_pred_orig_gow_gap)

            # Calculate optimism and append results
            optimism_eucl_sil = auc_boot_eucl_sil - auc_orig_eucl_sil
            optimism_eucl_gap = auc_boot_eucl_gap - auc_orig_eucl_gap
            optimism_gow_sil = auc_boot_gow_sil - auc_orig_gow_sil
            optimism_gow_gap = auc_boot_gow_gap - auc_orig_gow_gap

            optimisms_eucl_sil.append(optimism_eucl_sil)
            optimisms_eucl_gap.append(optimism_eucl_gap)
            optimisms_gow_sil.append(optimism_gow_sil)
            optimisms_gow_gap.append(optimism_gow_gap)

            ls_orig_eucl_sil = pd.concat([ls_orig_eucl_sil, l_orig_eucl_sil.reset_index(drop=True)], axis=1)
            ls_orig_eucl_gap = pd.concat([ls_orig_eucl_gap, l_orig_eucl_gap.reset_index(drop=True)], axis=1)
            ls_orig_gow_sil = pd.concat([ls_orig_gow_sil, l_orig_gow_sil.reset_index(drop=True)], axis=1)
            ls_orig_gow_gap = pd.concat([ls_orig_gow_gap, l_orig_gow_gap.reset_index(drop=True)], axis=1)

            # Calculate average optimism
            optimism_eucl_sil_avg = np.sum(optimisms_eucl_sil) / n_bootstraps
            optimism_eucl_gap_avg = np.sum(optimisms_eucl_gap) / n_bootstraps
            optimism_gow_sil_avg = np.sum(optimisms_gow_sil) / n_bootstraps
            optimism_gow_gap_avg = np.sum(optimisms_gow_gap) / n_bootstraps

            std_auc_boot_eucl_sil = np.std(aucs_boot_eucl_sil)
            std_auc_boot_eucl_gap = np.std(aucs_boot_eucl_gap)
            std_auc_boot_gow_sil = np.std(aucs_boot_gow_sil)
            std_auc_boot_gow_gap = np.std(aucs_boot_gow_gap)
        
        except Exception as e:
            print("An error occurred:", e)
            continue
            
        print(f"Algorithm: {algorithm_name}\n"
          f"Optimism Metrics:\n"
          f"  - Euclidean Silhouette: {optimism_eucl_sil_avg}\n"
          f"  - Euclidean Gap: {optimism_eucl_gap_avg}\n"
          f"  - Gower Silhouette: {optimism_gow_sil_avg}\n"
          f"  - Gower Gap: {optimism_gow_gap_avg}\n"

          f"Standard Deviation of Bootstrap AUCs:\n"
          f"  - Euclidean Silhouette: {std_auc_boot_eucl_sil}\n"
          f"  - Euclidean Gap: {std_auc_boot_eucl_gap}\n"
          f"  - Gower Silhouette: {std_auc_boot_eucl_gap}\n"
          f"  - Gower Gap: {std_auc_boot_gow_gap}\n")

Output

How can I fix this loop, such that I get one (averaged) print statement per algorithm, where the print statement contains the optimism-adjusted AUC estimates in accordance with the code procedure (the 4x repeated code lines, one for each n)?


Solution

  • While immediate answer involves de-indenting, consider refactoring for DRY-er code using functions, conditional expressions, and various data objects such as tuples and dictionaries:

    Functions (using conditional logic/assignment)

    def retrieve_n(method, algorithm, smpl, smpl_gap, range_n_clusters):
        # DICT OF TUPLES CONTAINING FUNCTION AND ARGS
        d = {
            "eucl_sil": (best_silhouette_euclidean, (algorithm, smpl, range_n_clusters)),
            "eucl_gap": (GapStatEucl.fit_predict, (algorithm, K, smpl_gap)),
            "gow_sil": (best_silhouette_gower, (algorithm, gower.gower_matrix(smpl), range_n_clusters)),
            "gow_gap": (GapStatGow.fit_predict, (algorithm, K, pd.DataFrame(smpl_gap)))
        }
    
        return d[method]
    
    def auc_bootstrap(algo_name, method, n, sample_iloc, orig_iloc):
        if algo_name == 'KMedoids':  
            clusterer = (
                KMedoids(n_clusters=n).fit(sample_iloc)
                if method.startswith("eucl") 
                else KMedoids(n_clusters=n, metric='precomputed').fit(gower.gower_matrix(sample_iloc))
            )
    
            l_orig = pd.DataFrame(clusterer.predict(orig_iloc))
    
    
        elif algo_name == 'AgglomerativeClustering':
            clusterer = (
                AgglomerativeClustering(n_clusters=n, linkage='average').fit(sample_iloc)
                if method.startswith("eucl") 
                else AgglomerativeClustering(n_clusters=n, metric='precomputed', linkage='average').fit(gower.gower_matrix(sample_iloc))
            )
            
            l_orig = (
                pd.DataFrame(clusterer.fit_predict(orig_data_iloc))
                if method.startswith("eucl") 
                else pd.DataFrame(clusterer.fit_predict(gower_matrix))
            )
    
        elif algo_name == 'SpectralClustering':
            clusterer = (
                SpectralClustering(n_clusters=n, affinity='nearest_neighbors').fit(sample_iloc)
                if method.startswith("eucl") 
                else SpectralClustering(n_clusters=n, affinity='precomputed').fit(1 - gower.gower_matrix(sample_iloc))
            )
    
            l_orig = (
                pd.DataFrame(clusterer.fit_predict(orig_data_iloc))
                if method.startswith("eucl") 
                else pd.DataFrame(clusterer.fit_predict(gower.gower_matrix(sample_iloc)))
            )
    
        return clusterer, l_orig
    

    Iterations (adding nested loop for 4 methods)

    # Iterate through clustering algorithms and perform bootstrap iterations
    algo_aucs_boots, algo_optimisms, algo_ls_orig_dfs = {}, {}, {}
    
    for algo_name, algo in algorithms.items():
        method_aucs_boots, method_optimisms, method_ls_orig_dfs = [
            { method: [] for method in ["eucl_sil", "eucl_gap", "gow_sil", "gow_gap"] }
            for _ in range(3)
        ]
    
        # Perform bootstrap iterations
        for i in range(n_bootstraps):
            sample = resample(orig_data, replace=True, n_samples=4509)
            sample_gap = resample(orig_data_gap, replace=True, n_samples=4509)
            y_boot = sample['y_orig']
    
            for method in ["eucl_sil", "eucl_gap", "gow_sil", "gow_gap"]:
                func, args = retrieve_n(method, algo, sample.iloc[:, 1:], range_n_clusters)
                n = func(*args)[0] if method.endswith("sil") else func(*args)
                clusterer, l_orig = auc_bootstrap(algo_name, method, n, sample.iloc[:, 1:], orig_data.iloc[:, 1:])
    
                # (2) AUC Bootstrap
                try:
                    l_boot = pd.DataFrame(clusterer.labels_)
                    df_boot = pd.DataFrame({"y_boot": np.ravel(y_boot), "l_boot": np.ravel(l_boot)})
                    m_ = sm.formula.glm(formula="y_boot ~ l_boot", family=sm.families.Binomial(), data=df_boot).fit()
                    
                    # Calculate AUC Boot
                    y_pred_boot = m_.predict(l_boot)
                    auc_boot = roc_auc_score(y_boot, y_pred_boot)
    
                    # Calculate AUC Original
                    y_pred_orig = m_.predict(pd.DataFrame(df_orig['labels_orig']))
                    auc_orig = roc_auc_score(df_orig['y_orig'], y_pred_orig)
    
                    # Calculate optimism and append results
                    optimism = auc_boot - auc_orig
    
                    # Append to the respective lists
                    method_aucs_boots[method].append(auc_boot)
                    method_optimisms[method].append(optimism)
                    method_ls_orig_dfs[method].append(l_orig.reset_index(drop=True))
    
                except Exception as e:
                    print("An error occurred:", e)
                    continue
    
        # Aggregate and compile
        optimism_sil_avg = {k: np.sum(d) / n_bootstraps for k,d in method_optimisms.items() }
        std_auc_boot = {k: np.std(d) for k,d in method_aucs_boot.items() }
        method_ls_orig_dfs = {k:pd.concat(dfs, axis=1) for k,dfs in method_ls_orig_dfs.items()}
    
        # Assign to the algo-level dicts
        algo_aucs_boots[algo] = method_aucs_boot
        algo_optimisms[algo] = method_optimisms
        algo_ls_orig_dfs[algo] = method_ls_orig_dfs
                
        print(
          f"Algorithm: {algo_name}\n"
          f"Optimism Metrics:\n"
          f"  - Euclidean Silhouette: {optimism_sil_avg["eucl_sil"]}\n"
          f"  - Euclidean Gap: {optimism_sil_avg["eucl_gap"]}\n"
          f"  - Gower Silhouette: {optimism_sil_avg["gow_sil"]}\n"
          f"  - Gower Gap: {optimism_sil_avg["gow_gap"]}\n"
    
          f"Standard Deviation of Bootstrap AUCs:\n"
          f"  - Euclidean Silhouette: {std_auc_boot["eucl_sil"]}\n"
          f"  - Euclidean Gap: {std_auc_boot["eucl_gap"]}\n"
          f"  - Gower Silhouette: {std_auc_boot["gow_sil"]}\n"
          f"  - Gower Gap: {std_auc_boot["gow_gap"]}\n"
        )
    

    NOTE: Above has not been tested. Please take the time to review and adjust as needed.