I am working creating a PCA combined with a Mahalanobis distance function to remove any outliers detected from the PCA transformation. I have come across this article which uses R: Mahalanobis output in R.
However, how do I implement the Mahalanobis with Python and get an ellipse to centre around the cluster.
Some sample data and plot:
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.covariance import EmpiricalCovariance, MinCovDet
iris = datasets.load_iris()
X = iris.data
y = iris.target
target_names = iris.target_names
steps = [
("preprocessing", StandardScaler()),
("pca", PCA()),]
pipe = Pipeline(steps)
searchPCA = GridSearchCV(pipe, { "pca__n_components": [1, 2,3]}, n_jobs = 3)
searchPCA.fit(X)
DF = searchPCA.transform(X)
plt.figure()
colors = ["navy", "turquoise", "darkorange"]
lw = 2
for color, i, target_name in zip(colors, [0, 1, 2], target_names):
plt.scatter(
DF[y == i, 0], DF[y == i, 1], color=color, alpha=0.8, lw=lw, label=target_name
)
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.title("PCA of IRIS dataset")
xx, yy = np.meshgrid(
np.linspace(plt.xlim()[0], plt.xlim()[1], 100),
np.linspace(plt.ylim()[0], plt.ylim()[1], 100),)
zz = np.c_[xx.ravel(), yy.ravel()]
robust_cov = MinCovDet().fit(DF[y == i,0:2])
mahal_robust_cov = robust_cov.mahalanobis(zz)
mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
plt.contour(
xx, yy, np.sqrt(mahal_robust_cov), cmap=plt.cm.YlOrBr_r, linestyles="dashed", levels=[3])
EDIT: This plots a contour and I managed to get all three ellipses for each cluster however I get part of the circle edge missing, how to fill these all in?
Please add plt.xlim()
and plt.ylim()
:
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.title("PCA of IRIS dataset")
plt.xlim(-3,4) # ADD
plt.ylim(-3,3) # ADD
xx, yy = np.meshgrid(
np.linspace(plt.xlim()[0], plt.xlim()[1], 100),
np.linspace(plt.ylim()[0], plt.ylim()[1], 100),)
zz = np.c_[xx.ravel(), yy.ravel()]