Tags: python, class, scikit-learn, hdbscan

Python HDBScan class always fails on second iteration before even entering first function


I am attempting to look at conglomerated outlier information, utilizing several different SKLearn, HDBScan, and custom outlier detection classes. However, for some reason I am consistently running into an error where any class utilizing HDBScan cannot be iterated over. All other Sklearn and Custom classes can. The issue I am getting seems to consistently occur on the second pass of the HDBScan class and instantly happens upon algorithm.fit(tmp). Upon debugging the script, it looks like the error is thrown before even getting to the first line of the Class.

Any help? Below is the minimum viable reproduction:

import numpy as np
import pandas as pd
import hdbscan
from sklearn.datasets import make_blobs
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope

class DBClass():
    """Thin fit/predict wrapper around ``hdbscan.HDBSCAN``.

    NOTE: this is the failing reproduction from the question and is kept
    as posted. ``fit`` stores the fitted clusterer in the instance attribute
    ``self.fit``, which hides the ``fit`` method on that instance. The first
    call to ``fit`` works; a second call on the same instance invokes the
    stored HDBSCAN object instead and raises
    ``TypeError: 'HDBSCAN' object is not callable``.
    """

    def __init__(self, random = None):
        # ``random`` is stored but not used by any method of this class.
        self.random = random

    def fit(self, data):
        """Fit a default-configured HDBSCAN clusterer on ``data``."""

        # Kept so ``predict`` can check that it is given the same data.
        self.train_data = data

        cluster = hdbscan.HDBSCAN()
        cluster.fit(self.train_data)
        # BUG (the subject of the question): this instance attribute shadows
        # the ``fit`` method, so the next ``instance.fit(...)`` call fails.
        self.fit = cluster

    def predict(self, data):
        """Return the fitted clusterer's ``probabilities_`` for the training data.

        If ``data`` does not equal the data passed to ``fit``, no branch
        returns a value, so the result is ``None``.
        """

        self.predict_data = data

        # ``equals`` is called on the training data, so it is presumably a
        # pandas object (the caller converts ndarrays to DataFrames).
        if self.train_data.equals(self.predict_data):
            return self.fit.probabilities_  


def OutlierEnsemble(df, anomaly_algorithms=None, num_slices=5, num_columns=7, outliers_fraction=0.05):
    """Run several outlier detectors over random column subsets of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame or numpy.ndarray
        Input data. An ndarray is wrapped in a DataFrame first.
    anomaly_algorithms : list of (name, estimator) pairs, optional
        Each estimator must provide ``fit(data)`` and ``predict(data)``.
        When omitted or empty, a default set is built (EllipticEnvelope,
        OneClassSVM, IsolationForest and DBClass).
    num_slices : int
        Number of random column subsets to draw.
    num_columns : int
        Number of columns sampled, without replacement, for each subset.
    outliers_fraction : float
        Passed as ``contamination`` / ``nu`` to the default estimators; it
        has no effect when ``anomaly_algorithms`` is supplied.

    Returns
    -------
    list
        One ``predict`` result per (subset, algorithm) pair, ordered subset
        by subset and, within a subset, in ``anomaly_algorithms`` order.

    Raises
    ------
    TypeError
        If ``df`` is neither a DataFrame nor an ndarray.
    """
    if isinstance(df, np.ndarray):
        df = pd.DataFrame(df)

    # Explicit check instead of ``assert``: asserts are removed under ``-O``.
    if not isinstance(df, pd.DataFrame):
        raise TypeError(
            "df must be a pandas DataFrame or a numpy ndarray, "
            f"got {type(df).__name__}"
        )

    if not anomaly_algorithms:
        anomaly_algorithms = [
            ("Robust covariance",
                EllipticEnvelope(contamination=outliers_fraction)),
            ("One-Class SVM",
                OneClassSVM(nu=outliers_fraction,
                            kernel="rbf")),
            ("Isolation Forest",
                IsolationForest(contamination=outliers_fraction)),
            ("HDBScan LOF",
                DBClass()),
        ]

    # All subsets are drawn before any estimator is fitted, so the order of
    # random draws does not depend on what the estimators do.
    subsets = [
        df.sample(n=num_columns, axis=1, replace=False)
        for _ in range(num_slices)
    ]

    predictions = []
    for subset in subsets:
        # The same estimator instances are re-fitted on every subset, so each
        # one must tolerate repeated ``fit`` calls.
        for _name, algorithm in anomaly_algorithms:
            algorithm.fit(subset)
            predictions.append(algorithm.predict(subset))

    return predictions

# Synthetic input: 3000 samples with 12 features each. The returned labels
# are not used by the ensemble.
blobs, labels = make_blobs(n_features=12, n_samples=3000)
OutlierEnsemble(blobs)

The error provided is not the most helpful.

Traceback (most recent call last):

  File "<ipython-input-4-e1d4b63cfccd>", line 75, in <module>
    OutlierEnsemble(blobs)

  File "<ipython-input-4-e1d4b63cfccd>", line 66, in OutlierEnsemble
    algorithm.fit(tmp)

TypeError: 'HDBSCAN' object is not callable

Solution

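  • Inside DBClass.fit, the line self.fit = cluster unintentionally replaces the fit method on that instance with the fitted HDBSCAN object. The first call works, but the next algorithm.fit(tmp) on the same instance tries to call that HDBSCAN object, hence "TypeError: 'HDBSCAN' object is not callable". Store the clusterer under a different attribute name.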

    You could perhaps use something like,

    class DBClass:
        """Fit/predict wrapper around ``hdbscan.HDBSCAN`` that can be re-fitted."""

        def __init__(self, random = None):
            self.random = random

        def fit(self, data):
            """Fit a default-configured HDBSCAN clusterer on ``data``."""
            self.train_data = data

            clusterer = hdbscan.HDBSCAN()
            clusterer.fit(data)
            # Saved under ``myfit`` so the ``fit`` method is not overwritten.
            self.myfit = clusterer

        def predict(self, data):
            """Return the stored ``probabilities_`` if ``data`` is the training data.

            Any other input yields ``None``.
            """
            self.predict_data = data

            if not self.train_data.equals(data):
                return None
            # Use the clusterer saved by ``fit``.
            return self.myfit.probabilities_