python dataframe scikit-learn cluster-analysis outliers

Python data filtering to remove outliers around a density plot


Referring to the plot below, I would like to remove all the outliers outside the dense region marked by the black oval. I can use a simple horizontal filter, such as -4 < data < 4, but some outliers still remain. I am looking for a technique that precisely captures the samples in the dense region and drops the outliers.

enter image description here

Sample data:

x = array([1243. , 1261. ,  973. ,  842. ,  592. ,  499. , 1088. ,  739.5,
        567.5,  536.5,  854. ,  763. ,  671. ,  574. ,  498.5,  510.5,
        541.5,  544. ,  565.5,  482. ,  416. ,  412.5,  440. ,  540. ,
        652. ,  735. ,  878. , 1030. , 1022. , 1105. , 1034. , 1064. ,
       1089. , 1115. , 1145. , 1146. , 1111. , 1117. , 1140. , 1168. ,
        845. , 1173. ,  898. , 1091. ,  591. ,  570.5,  506. ,  592.5,
        682.5,  619.5,  663. ,  593. ,  470. ,  810. ,  694.5,  900. ,
        965. ,  954. ,  771. ,  608.5,  631. ,  593. ,  652. ,  428. ,
        486. ,  445. ,  395.5,  387.5,  383. ,  390. ,  408. ,  420. ,
        470. ,  543.5,  686. ,  550. ,  588. ,  556.5,  475.5,  606. ,
        617. ,  674. ,  571. ,  810. ,  913. ,  868. ,  621.5,  417. ,
        388. ,  428. ,  501. ,  586.5,  668. ,  739. ,  914. ,  829. ,
        966. ,  995. , 1008. ,  961. ])

y = array([[-10.6,   0.4,   0.1,  -0.1,  -0.5,   0. ],
       [-12.5,   1.5,   1.4,   0.9,   0.7,   0.7],
       [  4.5,   0.3,   0.2,   0. ,   0.6,   0.2],
       [  4.6,  -0.7,  -0.8,  -0.9,  -0.7,  -0.8],
       [  1.8,  -1.3,  -1.6,  -1.8,  -1.4,  -1.5],
       [ 10.4,  -1.4,  -1.5,  -1.1,  -1.2,  -1.1],
       [  1. ,  -0.6,  -0.5,  -0.3,  -0.2,  -0.2],
       [  0. ,   0.2,  -0.1,   0.1,  -0.1,  -0.1],
       [ -1.7,  -1.1,  -1. ,  -0.9,  -0.8,  -0.7],
       [  1.6,  -1. ,  -1.3,  -0.7,  -1. ,  -0.8],
       [  0.5,   0. ,   0. ,   0.3,   0.1,   0.3],
       [ -0.1,  -0.3,  -0.5,  -0.2,  -0.1,  -0.1],
       [  0.8,  -0.4,  -0.3,  -0.4,  -0.5,  -0.5],
       [ -1.3,  -0.8,  -1. ,  -1. ,  -1.3,  -1.1],
       [ -0.1,  -1.9,  -2.2,  -1.6,  -1.7,  -1.5],
       [ -0.9,  -1.3,  -1.5,  -1.9,  -1.7,  -2.1],
       [ -0.5,  -0.8,  -0.9,  -1.3,  -1.4,  -1.3],
       [ -0.2,  -0.6,  -0.5,  -0.8,  -1.6,  -0.9],
       [ -0.8,  -1.2,  -1. ,  -0.6,  -0.8,  -0.9],
       [ -1.2,  -0.6,  -1. ,  -0.4,  -1.3,  -0.4],
       [ -1.1,  -1. ,  -1.1,  -1.2,  -1. ,  -1.3],
       [ -0.8,  -0.9,  -1. ,  -1. ,  -2.7,  -1. ],
       [ -1.2,  -1.4,  -1.4,  -1.1,  -1.6,  -1.1],
       [ -0.4,  -0.6,  -0.7,  -0.5,   3.5,  -0.6],
       [  0.4,   0.1,   0. ,   0.1,   7.3,   0.1],
       [  0.2,  -0.1,   0. ,   0.5,   3.2,   0.6],
       [  0.3,   0.4,   0.2,   0.1, -16.7,   0.1],
       [  1.3,   1.1,   1.1,   1.4,  -2.1,   1.3],
       [  1.2,   1.4,   1.3,   1.3,  -1.7,   1.4],
       [  1.6,   1.2,   1.3,   1.5,   1.6,   1.6],
       [  0.8,   1.3,   1.3,   1.1,   1.1,   1.2],
       [  0.4,   1. ,   1.1,   0.6,   0.8,   0.7],
       [  1. ,   1.1,   1.3,   0.9,   1. ,   1.1],
       [  0. ,   0.3,   0.3,  -0.2,  -0.4,  -0.2],
       [  0.4,   0.6,   0.7,   0.1,  -0.1,   0.2],
       [  1.6,   1. ,   0.9,   0.6,   0.8,   0.6],
       [  0.3,   0.6,   0.6,   0.3,   0.4,   0.5],
       [  0.2,  -0.6,   0. ,   0.2,   0.1,   0.2],
       [ -0.3,   0.6,   0.2,  -0.1,  -0.2,  -0.2],
       [  0.4,   0.5,   0.6,   0.2,   0.2,   0.3],
       [ -0.1,   0.1,   0.1,  -0.2,   0. ,  -0.2],
       [ -0.3,  -0.6,  -0.5,  -0.3,  -0.4,  -0.2],
       [  0.2,   0.1,   0.3,   0.1,   0.1,   0. ],
       [ -0.3,  -0.5,  -0.5,  -0.7,  -0.7,  -0.6],
       [ -1.1,  -0.8,  -0.9,  -0.8,  -1. ,  -0.9],
       [ -2.9,  -1.9,  -2.2,  -2.3,  -2.3,  -2.4],
       [ -3. ,  -2.4,  -2.5,  -2.2,  -1.9,  -2.3],
       [ -0.4,  -1.5,  -1.4,  -0.8,  -0.6,  -0.9],
       [  0.4,   0.1,   0. ,   0.4,   0. ,   0.4],
       [ -0.1,  -0.8,  -0.7,   0. ,  -0.1,  -0.1],
       [ -0.3,  -0.6,  -0.3,  -0.2,  -0.2,  -0.2],
       [  0.4,   0.4,   0.2,  -0.1,  -0.1,  -0.1],
       [ -1.9,  -1.6,  -1.8,  -1.7,  -1.8,  -1.8],
       [ -0.5,  -0.8,  -0.8,  -0.6,  -0.1,  -0.6],
       [  0.8,   0.4,   0.5,   0.8,   0.7,   0.7],
       [  1.1,   1. ,   1. ,   0.7,   0.9,   0.8],
       [  0.7,   0.8,   0.9,   0.7,   0.6,   0.7],
       [  1. ,   1.1,   1. ,   0.8,   0.8,   0.8],
       [  0.2,   0.5,   0.4,   0.3,   0.1,   0.3],
       [ -0.3,  -1.2,  -1. ,  -0.7,  -0.5,  -0.8],
       [ -0.4,  -0.5,  -0.4,  -0.2,  -0.4,  -0.2],
       [  0. ,  -0.5,  -0.2,   0.3,   0.1,   0.2],
       [  0.2,   0. ,   0.1,   0.1,  -0.1,   0. ],
       [ -1.1,  -0.6,  -0.8,  -0.7,  -0.6,  -0.7],
       [ -0.8,  -0.9,  -0.9,  -0.6,  -0.7,  -0.6],
       [ -0.7,  -0.4,  -0.6,  -0.5,  -0.6,  -0.4],
       [ -1.6,  -1.2,  -1.4,  -1.1,  -1.2,  -1.3],
       [ -0.5,  -1.6,  -1.5,  -0.7,  -0.7,  -0.7],
       [ -1. ,  -1.2,  -1.3,  -0.6,  -0.9,  -0.8],
       [ -0.7,  -0.4,  -0.4,  -0.5,  -0.7,  -0.5],
       [ -0.1,  -0.2,  -0.3,   0. ,  -0.2,  -0.1],
       [ -0.5,  -0.4,  -0.4,  -0.3,  -0.3,  -0.2],
       [ -0.5,  -0.3,  -0.5,  -0.3,  -0.4,  -0.4],
       [  0.2,   0. ,   0. ,   0.1,   0. ,   0.1],
       [  0.9,   0.7,   0.8,   0.5,   0.6,   0.6],
       [  0.5,   0.6,   0.5,   0.6,   0.5,   0.5],
       [ -0.1,   0.2,   0.2,   0.4,   0.4,   0.4],
       [  0. ,   0.2,   0.1,   0.2,   0.2,   0.2],
       [ -0.4,  -0.2,  -0.4,  -0.2,  -0.3,  -0.2],
       [ -0.1,  -0.1,  -0.1,  -0.3,  -0.2,  -0.2],
       [  0.1,   0.4,   0.3,   0.1,   0.1,   0.1],
       [  0. ,   0. ,  -0.1,   0.2,   0.2,   0.3],
       [  0.7,   0.8,   0.9,   0.6,   0.6,   0.5],
       [  0.4,   0.2,   0.4,  -0.1,   0. ,   0.1],
       [  1.7,   1.4,   1.4,   1.2,   1.3,   1.2],
       [  0.9,   1. ,   1. ,   0.8,   1. ,   0.8],
       [  0.3,   0.5,   0.6,   0.4,   0.3,   0.3],
       [ -1.4,  -1. ,  -1.2,  -0.9,  -0.7,  -0.8],
       [ -1. ,  -1. ,  -1. ,  -1. ,  -1.2,  -1.1],
       [ -0.6,  -0.7,  -0.8,  -0.9,  -0.9,  -0.8],
       [ -0.5,  -0.8,  -0.7,  -0.3,  -0.4,  -0.4],
       [  0. ,  -0.2,  -0.1,  -0.3,  -0.5,  -0.3],
       [ -0.3,   0.2,   0. ,   0.1,   0. ,   0. ],
       [  0.8,   0.3,   0.4,   0.4,   0.5,   0.5],
       [  1.2,   1. ,   1.2,   0.8,   0.8,   0.6],
       [  1.7,   1.3,   1.4,   1.8,   1.8,   1.7],
       [  1.2,   1.1,   1.2,   1.1,   1.3,   1.3],
       [  1.5,   1.6,   1.6,   1.4,   1.7,   1.4],
       [  1.7,   1.8,   2. ,   1.5,   1.8,   1.5],
       [  0.6,   0.8,   1. ,   0.8,   1.3,   1. ]])

Solution

  • The code below uses an IsolationForest to separate outliers from inliers. The main parameter to tune is `contamination`, the expected fraction of outliers in the data.

    It works on the sample data, but to generalise to your larger dataset we would need a more representative sample.

    enter image description here

    import numpy as np
    from matplotlib import pyplot as plt
    from sklearn.ensemble import IsolationForest

    # x, y from OP: x is 1-D, y is 2-D with one row per x value.

    # Pair every y[i, j] with x[i]. Repeating each x once per column of y
    # matches the row-major order that y.ravel() produces.
    x_flat = np.repeat(x, y.shape[1])
    y_flat = y.ravel()
    xy = np.column_stack([x_flat, y_flat])

    # Outlier modelling
    contamination_fraction = 5 / 100  # expected share of outliers (5%)
    model = IsolationForest(
        # Pass the fraction as documented above; scaling it here would
        # silently flag fewer points than the stated 5%.
        contamination=contamination_fraction,
        # Fixed seed so the inlier/outlier split is the same on every run.
        random_state=0,
    )
    pred = model.fit_predict(xy)  # +1 = inlier, -1 = outlier

    # The filtered data the question asks for: (x, y) pairs kept as inliers.
    xy_inliers = xy[pred == 1]

    cmap = 'PiYG'
    # Coloured discs encode the prediction; the small white dot drawn on top
    # marks the location of each data point.
    plt.scatter(xy[:, 0], xy[:, 1], c=pred, edgecolor='none', cmap=cmap)
    plt.scatter(xy[:, 0], xy[:, 1], marker='.', s=5, color='white')
    plt.gcf().set_size_inches(7, 4)
    plt.xlabel('x')
    plt.ylabel('y')
    plt.gca().spines[:].set_visible(False)

    # Legend: empty scatters act as proxy handles. With the colormap
    # resampled to 2 colours, 0. is the colour used for pred == -1 and 1. the
    # colour used for pred == +1.
    plt.scatter([], [], color=plt.get_cmap(cmap, 2)(0.), label='outlier')
    plt.scatter([], [], color=plt.get_cmap(cmap, 2)(1.), label='inlier')
    plt.scatter([], [], edgecolor='gray', color='white', label='data')
    plt.legend()