python list matplotlib cluster-analysis dynamic-list

How to cluster dynamic bounding-box data

I have a nested list that represents dynamic image data, with each inner list containing the x-center coordinate of a bounding box and a number. I need to cluster these points, get an approximate integer x coordinate for each bounding box, and sort the result by that value. The expected output is [[100, 1], [120, 1], [151, 3], [180, 0]].

I've visualized this data using Matplotlib to gain insights into its distribution across the space. Now, I'm looking to perform clustering on this data, approximate the coordinates, and sort it based on the x-center values. To help you better understand, I'll provide the code and a plot as well.

import matplotlib.pyplot as plt
# Nested sample data: an outer list of rows, each row holding [x, y] pairs.
data = [
    [[100, 1], [120, 1], [150, 3]],
    [[101, 1], [119, 1], [151, 3]],
    [[102, 1], [123, 1], [150, 3], [180, 0]],
    [[103, 1], [154, 3], [180, 0]],
    [[103, 1], [152, 3], [181, 0]],
    [[101, 1], [120, 1], [180, 0]],
    [[101, 1], [120, 1], [150, 3]],
    [[101, 1], [119, 1], [150, 3]],
    [[102, 1], [123, 1], [150, 3], [181, 0]],
    [[103, 1], [153, 3], [181, 0]],
    [[103, 1], [152, 3], [181, 0]],
    [[101, 1], [120, 1], [180, 0]],
    [[100, 1], [122, 1], [150, 3], [181, 0]],
    # Add more data here
]

# Flatten the two nesting levels: the first element of every pair goes to
# x_values and the second to y_values, in row-by-row order.
x_values = [pair[0] for row in data for pair in row]
y_values = [pair[1] for row in data for pair in row]

# Dump both flattened coordinate lists so they can be inspected as text.
for caption, values in (("X Values:", x_values), ("Y Values:", y_values)):
    print(caption, values)


# Draw every (x, y) pair as a scatter point on the current axes, then label
# the axes, add a title and a grid, and display the figure.
axes = plt.gca()
axes.scatter(x_values, y_values)
axes.set_xlabel('X Values')
axes.set_ylabel('Y Values')
axes.set_title('Scatter Plot of X and Y Values')
axes.grid(True)
plt.show()

Solution

You can use the DBSCAN clustering algorithm to achieve this. Here is the code and result:

import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import DBSCAN

# Nested sample data: an outer list of rows, each row holding [x, y] pairs.
data = [
    [[100, 1], [120, 1], [150, 3]],
    [[101, 1], [119, 1], [151, 3]],
    [[102, 1], [123, 1], [150, 3], [180, 0]],
    [[103, 1], [154, 3], [180, 0]],
    [[103, 1], [152, 3], [181, 0]],
    [[101, 1], [120, 1], [180, 0]],
    [[101, 1], [120, 1], [150, 3]],
    [[101, 1], [119, 1], [150, 3]],
    [[102, 1], [123, 1], [150, 3], [181, 0]],
    [[103, 1], [153, 3], [181, 0]],
    [[103, 1], [152, 3], [181, 0]],
    [[101, 1], [120, 1], [180, 0]],
    [[100, 1], [122, 1], [150, 3], [181, 0]],
    # Add more data here
]

# Collect the first element of every pair into x_values and the second into
# y_values, one row at a time, so both lists keep the original point order.
x_values, y_values = [], []
for row in data:
    x_values.extend(pair[0] for pair in row)
    y_values.extend(pair[1] for pair in row)

# Cluster the flattened (x, y) points with DBSCAN and reduce every cluster to a
# single representative point (its mean), ordered by x.
X = np.column_stack((x_values, y_values))  # one row per point: [x, y]
clustering = DBSCAN(eps=10, min_samples=2).fit(X)
labels = clustering.labels_

centroids = []
for label in np.unique(labels):
    # DBSCAN labels noise samples -1; they do not form a cluster, so they must
    # not contribute a centroid.
    if label == -1:
        continue
    cluster_values = X[labels == label]
    # Could also use median here
    centroid = np.mean(cluster_values, axis=0)
    centroids.append(centroid)

# Force an (n_clusters, 2) shape so the column indexing below still works when
# no cluster was found (e.g. every point was classified as noise).
centroids = np.array(centroids, dtype=float).reshape(-1, 2)

# Cluster label numbering is not tied to the x position, so sort explicitly by
# the x column to get the centroids in left-to-right order.
centroids = centroids[np.argsort(centroids[:, 0], kind="stable")]
for centroid in centroids:
    print(centroid)

# Integer approximation of each centroid as a plain nested list, sorted by x,
# in the same [[x, y], ...] layout as the input pairs.
approx_centroids = np.rint(centroids).astype(int).tolist()
print(approx_centroids)

# Raw points plus a large hollow red circle around every cluster centroid.
plt.scatter(X[:, 0], X[:, 1])
plt.scatter(centroids[:, 0], centroids[:, 1], s=1000, facecolors="none", edgecolors="r")
plt.show()