Tags: python, numpy, data-analysis, k-means

Dunn index and inertia in kmeans algorithm


I have code that runs a k-means algorithm on some data, and I now need it to calculate the Dunn index and the inertia of the resulting clustering. The program is restricted to numpy, matplotlib and csv, and none of the videos I found online show how to calculate the Dunn index with only these libraries. I am not very comfortable with math, so translating the formulas into code is too hard for me.

I have searched online for how to calculate the Dunn index and the inertia in Python using only numpy, but every example I found relies on another library.

Here is the code:

import numpy as np
import matplotlib.pyplot as plt
import csv

def load_data(file_path):
    """Load the first two columns of a CSV file as a float array.

    The first line of the file is treated as a header and skipped. Blank
    lines (which ``csv.reader`` yields as empty rows) are ignored instead
    of crashing the loader.

    Parameters
    ----------
    file_path : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    np.ndarray
        Array of shape ``(n_rows, 2)`` holding ``float(row[0])`` and
        ``float(row[1])`` for every non-blank data row. A file with no data
        rows yields an array of shape ``(0, 2)``.

    Raises
    ------
    ValueError
        If one of the first two fields of a row is not a valid float.
    IndexError
        If a non-blank row has fewer than two fields.
    """
    # newline='' is what the csv module documents for files handed to
    # csv.reader, so that quoted fields with embedded newlines parse correctly.
    with open(file_path, 'r', newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(csvreader, None)
        data = [
            [float(row[0]), float(row[1])]
            for row in csvreader
            if row  # blank lines come through as []
        ]
    # reshape keeps the result two-dimensional even when there are no rows.
    return np.array(data, dtype=float).reshape(-1, 2)

def calculate_distances(data, centers):
    """Append one Euclidean-distance column per center to a copy of *data*.

    Parameters
    ----------
    data : np.ndarray
        Two-dimensional array with one sample per row.
    centers : np.ndarray
        Two-dimensional array with one center per row, with as many columns
        as *data*.

    Returns
    -------
    np.ndarray
        A new array whose leading columns are *data* and whose following
        columns hold, in order, the distance of every sample to each center.
        *data* itself is not modified.
    """
    columns = [data]
    for center in centers:
        offsets = data - center
        columns.append(np.sqrt(np.sum(offsets ** 2, axis=1)))
    # A single column_stack builds the widened array in one go.
    return np.column_stack(columns)

def get_clusters(data_with_distances):
    """Split samples into clusters by their nearest center.

    Parameters
    ----------
    data_with_distances : np.ndarray
        Two-dimensional array whose first two columns are the sample
        coordinates and whose remaining columns are the distances to each
        center (one column per center).

    Returns
    -------
    list of np.ndarray
        One array per center, in center order, containing the full rows
        (coordinates and distance columns) of the samples assigned to it.
        A center that attracts no sample yields an array with zero rows.

    Notes
    -----
    Each sample is assigned to exactly one cluster. When a sample is
    equidistant from several centers it goes to the one with the lowest
    index, so ties no longer duplicate the sample across clusters.
    """
    distances = data_with_distances[:, 2:]
    num_clusters = distances.shape[1]
    if num_clusters == 0:
        return []

    # argmin is evaluated once for all rows and returns the first minimum,
    # which gives a unique label per sample.
    nearest = np.argmin(distances, axis=1)
    return [data_with_distances[nearest == i, :] for i in range(num_clusters)]

def calculate_centers(clusters):
    """Compute the centroid of every cluster.

    Parameters
    ----------
    clusters : list of np.ndarray
        One two-dimensional array per cluster, one sample per row. Only the
        first two columns are treated as coordinates; any further columns
        are ignored.

    Returns
    -------
    np.ndarray
        Array with one row per cluster holding the mean of its first two
        columns.
    """
    centroids = []
    for members in clusters:
        column_means = np.mean(members, axis=0)
        # Keep only the coordinate columns of the per-column means.
        centroids.append(column_means[:2])
    return np.array(centroids)

def plot_clusters(clusters, centers):
    """Scatter-plot every cluster in its own colour and mark the centers.

    Parameters
    ----------
    clusters : list of np.ndarray
        One array per cluster; columns 0 and 1 are used as x and y.
    centers : iterable of array-like
        Cluster centers; elements 0 and 1 of each are used as x and y.

    Notes
    -----
    The palette is reused cyclically, so any number of clusters can be
    drawn without running past the end of the colour list. The first three
    colours are unchanged, and purple is left out because it is reserved
    for the center markers.
    """
    colors = [
        'blue', 'red', 'green', 'orange', 'cyan',
        'magenta', 'brown', 'olive', 'gray', 'pink',
    ]
    for i, cluster in enumerate(clusters):
        # Modulo indexing wraps around instead of raising IndexError when
        # there are more clusters than colours.
        plt.scatter(cluster[:, 0], cluster[:, 1], color=colors[i % len(colors)])
    for center in centers:
        plt.scatter(center[0], center[1], color='purple', marker='*', s=150)
    plt.xlabel('Household Total Assets')
    plt.ylabel('Annual Household Income')
    plt.title('K-means Clustering of Household Data')
    plt.show()

def run(data, num_clusters, max_iterations=100):
    """Run k-means on *data*, plot the result and return it.

    Parameters
    ----------
    data : np.ndarray
        Two-dimensional array with one sample per row.
    num_clusters : int
        Number of clusters; the initial centers are that many randomly
        chosen rows of *data*.
    max_iterations : int, optional
        Upper bound on the number of assign/update rounds. Must be at
        least 1.

    Returns
    -------
    tuple
        ``(clusters, centers)`` as produced by the last round, so callers
        can compute quality measures such as inertia or the Dunn index.
        Callers that ignore the return value are unaffected.

    Raises
    ------
    ValueError
        If ``max_iterations`` is smaller than 1 (no clustering would be
        produced to plot or return).
    """
    if max_iterations < 1:
        raise ValueError("max_iterations must be at least 1")

    current_centers = np.random.permutation(data)[:num_clusters]

    for _ in range(max_iterations):
        data_with_distances = calculate_distances(data, current_centers)
        clusters = get_clusters(data_with_distances)
        new_centers = calculate_centers(clusters)
        # Once the centers stop moving, further rounds would reproduce the
        # same assignment, so stopping here gives the same final result.
        converged = np.array_equal(new_centers, current_centers)
        current_centers = new_centers
        if converged:
            break

    plot_clusters(clusters, current_centers)
    return clusters, current_centers
    
def main(file_path, min_clusters=2, max_clusters=10):
    """Load the data set and run k-means for a range of cluster counts.

    Parameters
    ----------
    file_path : str or path-like
        CSV file passed to ``load_data``.
    min_clusters : int, optional
        Smallest number of clusters to try (default 2).
    max_clusters : int, optional
        Largest number of clusters to try, inclusive (default 10).

    With the defaults this sweeps 2 through 10 clusters, the same range
    that was previously hard-coded.
    """
    data = load_data(file_path)
    for num_clusters in range(min_clusters, max_clusters + 1):
        run(data, num_clusters)

file_path = 'assessment2dmv.csv'
# NOTE(review): num_clusters is never read; main() sweeps the cluster count
# itself. Kept in case other code refers to it.
num_clusters = 3

# Only start the clustering when executed as a script, so importing this
# module does not read the CSV file or open plot windows.
if __name__ == "__main__":
    main(file_path)

Solution

  • For the Dunn index, you can use this GitHub repo, which contains the following function:

    def dunn(k_list):
        """Dunn index [CVI], computed with NumPy only.

        The Dunn index is the smallest distance between two samples that
        belong to different clusters, divided by the largest distance
        between two samples that belong to the same cluster. Higher values
        indicate compact, well-separated clusters.

        Parameters
        ----------
        k_list : list of np.arrays
            A list containing a numpy array for each cluster |c| = number of clusters
            c[K] is np.array([N, p]) (N : number of samples in cluster K, p : sample dimension).
            Every array must contain feature columns only; empty clusters
            are ignored.

        Returns
        -------
        float
            The Dunn index. ``np.inf`` is returned when every cluster has
            zero diameter (for example, all clusters are single points).

        Raises
        ------
        ValueError
            If fewer than two non-empty clusters are given, because the
            index is undefined in that case.
        """
        clusters = [
            np.asarray(c, dtype=float).reshape(len(c), -1)
            for c in k_list
            if len(c) > 0
        ]
        if len(clusters) < 2:
            raise ValueError("Dunn index needs at least two non-empty clusters")

        # Start from +inf / 0 rather than a fixed sentinel such as 1e6, so
        # data measured on a large scale cannot be clipped by the sentinel.
        min_separation = np.inf
        max_diameter = 0.0
        for k, ck in enumerate(clusters):
            max_diameter = max(max_diameter, float(_pairwise_distances(ck, ck).max()))
            # Distances are symmetric, so each unordered pair is visited once.
            for cl in clusters[k + 1:]:
                min_separation = min(
                    min_separation, float(_pairwise_distances(ck, cl).min())
                )

        if max_diameter == 0.0:
            return np.inf
        return min_separation / max_diameter

    def _pairwise_distances(a, b):
        """Return the (len(a), len(b)) matrix of Euclidean distances between rows."""
        # Broadcasting builds every row-to-row difference at once; memory use
        # grows with len(a) * len(b) * p.
        diff = a[:, None, :] - b[None, :, :]
        return np.sqrt((diff ** 2).sum(axis=2))
    

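    Regarding the inertia, you need to compute the following quantity (where x ranges over the vectors of cluster k, C_k is the set of vectors in cluster k, and \mu_k is the center of cluster k):

    $$J = \sum_{k=1}^{K} \sum_{x \in C_k} \lVert x - \mu_k \rVert^2$$

    The code would look something like this: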

    def get_inertia(clusters, centers):
        """Return the inertia (within-cluster sum of squared distances).

        Parameters
        ----------
        clusters : list of np.ndarray
            One array per cluster with one sample per row. If a row has more
            columns than its center has coordinates (for example cached
            distance columns appended after the features), only the leading
            columns are used.
        centers : sequence of array-like
            Center of each cluster, in the same order as *clusters*.

        Returns
        -------
        float
            Sum over all clusters of the squared Euclidean distance between
            every sample and the center of its own cluster. Empty clusters
            contribute nothing.
        """
        inertia = 0.0
        # Each cluster is paired with its own center only; comparing every
        # cluster against every center would count foreign distances too.
        for cluster, center in zip(clusters, centers):
            center = np.asarray(center, dtype=float)
            points = np.asarray(cluster, dtype=float)
            if points.size == 0:
                continue
            points = points.reshape(len(points), -1)[:, :center.shape[0]]
            inertia += float(np.sum((points - center) ** 2))
        return inertia