python, cluster-analysis, dbscan

DBSCAN not making sense for small numbers of points


I am playing around with a DBSCAN example in order to see if it will work for me. In my case, I have clusters of a few points (3-5) close together, with a fairly long distance in between clusters. I have tried to replicate the situation in the following code. I figured that with a low epsilon and low min_samples this should work, but instead it is telling me that it only sees 1 group (and 20 noise points?). Am I using this incorrectly, or is DBSCAN not good for this type of problem? I went with DBSCAN instead of k-means because I don't know beforehand exactly how many clusters there will be (1-5).

from sklearn.datasets import make_blobs
from sklearn.cluster import DBSCAN
import numpy as np
import matplotlib.pyplot as plt

# Configuration options
num_samples_total = 20
cluster_centers = [(3,3), (7,7),(7,3),(3,7),(5,5)]
num_classes = len(cluster_centers)
#epsilon = 1.0
epsilon = 1e-5
#min_samples = 13
min_samples = 2

# Generate data
X, y = make_blobs(n_samples = num_samples_total, centers = cluster_centers, n_features = num_classes, center_box=(0, 1), cluster_std = 0.05)

np.save('./clusters.npy', X)
X = np.load('./clusters.npy')

# Compute DBSCAN
db = DBSCAN(eps=epsilon, min_samples=min_samples).fit(X)
labels = db.labels_

no_clusters = len(np.unique(labels) )
no_noise = np.sum(np.array(labels) == -1, axis=0)

print('Estimated no. of clusters: %d' % no_clusters)
print('Estimated no. of noise points: %d' % no_noise)

# Generate scatter plot for training data
colors = list(map(lambda x: '#3b4cc0' if x == 1 else '#b40426', labels))                #only set for 2 colors
plt.scatter(X[:,0], X[:,1], c=colors, marker="o", picker=True)
plt.title('Two clusters with data')
plt.xlabel('Axis X[0]')
plt.ylabel('Axis X[1]')
plt.show()
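
For reference, a quick way to sanity-check whether eps is in the right ballpark for this data is to compare it against each point's nearest-neighbour distance: if eps is far below those distances, no point ever has a neighbour within eps, every point gets labelled noise (-1), and np.unique(labels) then reports a single "cluster". The following diagnostic sketch is not part of the original code; it just reuses the clusters.npy file saved above:

from sklearn.neighbors import NearestNeighbors
import numpy as np

X = np.load('./clusters.npy')

# Distance from each point to its single nearest neighbour
# (column 0 is the point itself at distance 0, so take column 1).
nbrs = NearestNeighbors(n_neighbors=2).fit(X)
distances, _ = nbrs.kneighbors(X)
nearest = distances[:, 1]

print('nearest-neighbour distance: min=%.4f  median=%.4f  max=%.4f'
      % (nearest.min(), np.median(nearest), nearest.max()))
# With cluster_std = 0.05 these distances are on the order of 0.01-0.15,
# so eps = 1e-5 leaves every point without a neighbour within eps and
# DBSCAN marks everything as noise (label -1).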

Solution

  • ended up going with k-means and a modified elbow method (code below; a reusable sketch of the same idea follows after it):

    # Adapted from an example by Phil Roth <mr.phil.roth@gmail.com>,
    # BSD 3 clause.
    
    import numpy as np
    import matplotlib.pyplot as plt
    
    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs
    
    # Configuration options
    num_samples_total = 20
    cluster_centers = [(3,3), (7,7), (7,3), (3,7), (5,5)]
    num_classes = len(cluster_centers)
    
    # Generate data
    X, y = make_blobs(n_samples = num_samples_total, centers = cluster_centers, n_features = num_classes, center_box=(0, 1), cluster_std = 0.05)
    
    # Modified elbow method: instead of looking for an elbow, keep adding
    # clusters until the inertia drops below 1. If the loop goes one step
    # too far, the same shape just gets shown twice, so that is not a problem.
    clusterIdx = 0
    inertia = np.inf              # sentinel so the loop runs at least once
    while inertia > 1:
        clusterIdx = clusterIdx + 1
        kmeans = KMeans(n_clusters=clusterIdx, random_state=0).fit(X)
        inertia = kmeans.inertia_  # within-cluster sum of squared distances
        print(inertia)
    print('Estimated no. of clusters: %d' % clusterIdx)
    plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_)
    plt.show()
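
If this gets reused, the same idea can be packaged as a small helper. This is only a sketch under the same assumptions as the code above (tight, well-separated blobs with cluster_std = 0.05, so an inertia cutoff of 1 is sensible); the name estimate_n_clusters, the max_clusters cap, and the threshold parameter are illustrative, not from the original answer:

    from sklearn.cluster import KMeans

    def estimate_n_clusters(X, inertia_threshold=1.0, max_clusters=10):
        """Hypothetical helper: grow k until the KMeans inertia drops below
        inertia_threshold, then return that k and the fitted model."""
        for k in range(1, max_clusters + 1):
            kmeans = KMeans(n_clusters=k, random_state=0).fit(X)
            if kmeans.inertia_ < inertia_threshold:
                return k, kmeans
        return max_clusters, kmeans

    # Usage with the blobs generated above:
    # k, model = estimate_n_clusters(X)

Note that the cutoff is scale-dependent: with wider clusters or more points, the threshold of 1 would need to be raised (or the data standardised first).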