I am attempting to look at conglomerated outlier information, utilizing several different SKLearn, HDBScan, and custom outlier detection classes. However, for some reason I am consistently running into an error where any class utilizing HDBScan cannot be iterated over. All other Sklearn and Custom classes can. The issue I am getting seems to consistently occur on the second pass of the HDBScan class and instantly happens upon algorithm.fit(tmp). Upon debugging the script, it looks like the error is thrown before even getting to the first line of the Class.
Any help? Below is the minimum viable reproduction:
import numpy as np
import pandas as pd
import hdbscan
from sklearn.datasets import make_blobs
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
class DBClass():
def __init__(self, random = None):
self.random = random
def fit(self, data):
self.train_data = data
cluster = hdbscan.HDBSCAN()
cluster.fit(self.train_data)
self.fit = cluster
def predict(self, data):
self.predict_data = data
if self.train_data.equals(self.predict_data):
return self.fit.probabilities_
def OutlierEnsemble(df, anomaly_algorithms = None, num_slices = 5, num_columns = 7, outliers_fraction = 0.05):
if isinstance(df, np.ndarray):
df = pd.DataFrame(df)
assert isinstance(df, pd.DataFrame)
if not anomaly_algorithms:
anomaly_algorithms = [
("Robust covariance",
EllipticEnvelope(contamination=outliers_fraction)),
("One-Class SVM",
OneClassSVM(nu=outliers_fraction,
kernel="rbf")),
("Isolation Forest",
IsolationForest(contamination=outliers_fraction)),
("HDBScan LOF",
DBClass()),
]
data = []
for i in range(1, num_slices + 1):
data.append(df.sample(n = num_columns, axis = 1, replace = False))
predictions = []
names = []
for tmp in data:
counter = 0
for name, algorithm in anomaly_algorithms:
algorithm.fit(tmp)
predictions.append(algorithm.predict(tmp))
counter += 1
names.append(f"{name}{counter}")
return predictions
blobs, labels = make_blobs(n_samples=3000, n_features=12)
OutlierEnsemble(blobs)
The error provided is not the most helpful.
Traceback (most recent call last):
File "<ipython-input-4-e1d4b63cfccd>", line 75, in <module>
OutlierEnsemble(blobs)
File "<ipython-input-4-e1d4b63cfccd>", line 66, in OutlierEnsemble
algorithm.fit(tmp)
TypeError: 'HDBSCAN' object is not callable
In your DBClass.fit
, DBClass.fit
is unintentionally redefined.
You could perhaps use something like,
class DBClass():
def __init__(self, random = None):
self.random = random
def fit(self, data):
self.train_data = data
cluster = hdbscan.HDBSCAN()
cluster.fit(self.train_data)
self.myfit = cluster # save calculated cluster
def predict(self, data):
self.predict_data = data
if self.train_data.equals(self.predict_data):
return self.myfit.probabilities_ # use calculated cluster