pythonmachine-learningscikit-learnpcamahalanobis

Plot Mahalanobis distance as ellipse for PCA is missing part of circle edge


I am working creating a PCA combined with a Mahalanobis distance function to remove any outliers detected from the PCA transformation. I have come across this article which uses R: Mahalanobis output in R.
However, how do I implement the Mahalanobis with Python and get an ellipse to centre around the cluster.

Some sample data and plot:

import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.covariance import EmpiricalCovariance, MinCovDet

iris = datasets.load_iris()

X = iris.data
y = iris.target
target_names = iris.target_names

steps = [
        ("preprocessing", StandardScaler()),
        ("pca", PCA()),]
        
pipe = Pipeline(steps)
searchPCA = GridSearchCV(pipe, { "pca__n_components": [1, 2,3]}, n_jobs = 3)
searchPCA.fit(X)
DF = searchPCA.transform(X)

plt.figure()
colors = ["navy", "turquoise", "darkorange"]
lw = 2

for color, i, target_name in zip(colors, [0, 1, 2], target_names):
    plt.scatter(
        DF[y == i, 0], DF[y == i, 1], color=color, alpha=0.8, lw=lw, label=target_name
    )
    plt.legend(loc="best", shadow=False, scatterpoints=1)
    plt.title("PCA of IRIS dataset")
    xx, yy = np.meshgrid(
            np.linspace(plt.xlim()[0], plt.xlim()[1], 100),
            np.linspace(plt.ylim()[0], plt.ylim()[1], 100),)
    zz = np.c_[xx.ravel(), yy.ravel()]
    
    robust_cov = MinCovDet().fit(DF[y == i,0:2])
    
    mahal_robust_cov = robust_cov.mahalanobis(zz)
    
    mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)

    plt.contour(
            xx, yy, np.sqrt(mahal_robust_cov), cmap=plt.cm.YlOrBr_r, linestyles="dashed", levels=[3])

EDIT: This plots a contour and I managed to get all three ellipses for each cluster however I get part of the circle edge missing, how to fill these all in?

enter image description here


Solution

  • Please add plt.xlim() and plt.ylim():

    plt.legend(loc="best", shadow=False, scatterpoints=1)
    plt.title("PCA of IRIS dataset")
    plt.xlim(-3,4) # ADD
    plt.ylim(-3,3) # ADD
    xx, yy = np.meshgrid(
            np.linspace(plt.xlim()[0], plt.xlim()[1], 100),
            np.linspace(plt.ylim()[0], plt.ylim()[1], 100),)
    zz = np.c_[xx.ravel(), yy.ravel()]
    

    enter image description here