python · cluster-analysis · precision · unsupervised-learning

Rand Index function (clustering performance evaluation)


As far as I know, there is no package available for the Rand Index in Python, whereas for the Adjusted Rand Index you can use sklearn.metrics.adjusted_rand_score(labels_true, labels_pred).

I wrote the code for Rand Score and I am going to share it with others as the answer to the post.


Solution

  • from scipy.misc import comb
    from itertools import combinations
    import numpy as np
    
    def check_clusterings(labels_true, labels_pred):
        """Validate two label sequences and return them as NumPy arrays.

        Both inputs are converted with ``np.asarray`` and must be
        one-dimensional and of identical shape.

        Parameters
        ----------
        labels_true : array-like
            Reference labels, one per sample.
        labels_pred : array-like
            Labels to compare against the reference, one per sample.

        Returns
        -------
        tuple of numpy.ndarray
            ``(labels_true, labels_pred)`` as arrays.

        Raises
        ------
        ValueError
            If either input is not 1D, or if the two shapes differ.
        """
        # Convert both inputs up front so the checks below see real arrays.
        truth = np.asarray(labels_true)
        guess = np.asarray(labels_pred)

        # Dimensionality is checked for the reference labels first.
        if truth.ndim != 1:
            message = "labels_true must be 1D: shape is %r" % (truth.shape,)
            raise ValueError(message)
        if guess.ndim != 1:
            message = "labels_pred must be 1D: shape is %r" % (guess.shape,)
            raise ValueError(message)

        # Both are 1D at this point, so shape[0] is the sample count.
        if truth.shape != guess.shape:
            sizes = (truth.shape[0], guess.shape[0])
            raise ValueError(
                "labels_true and labels_pred must have same size, got %d and %d"
                % sizes)

        return truth, guess
    
    def rand_score(labels_true, labels_pred):
        """Return the Rand Index between two labelings of the same samples.

        The Rand Index is the fraction of unordered sample pairs on which the
        two clusterings agree: pairs placed together in both, plus pairs kept
        apart in both, divided by the total number of pairs.

        Parameters
        ----------
        labels_true : array-like, 1D
            Reference cluster label of each sample.
        labels_pred : array-like, 1D
            Cluster label of each sample to evaluate; same length as
            ``labels_true``.

        Returns
        -------
        float
            Score in [0, 1]; 1.0 means the two clusterings agree on every
            pair. Inputs with fewer than two samples contain no pairs to
            disagree on and score 1.0.

        Raises
        ------
        ValueError
            Raised by ``check_clusterings`` when either input is not 1D or
            the two inputs differ in length.
        """
        # Use the validated arrays so any array-like input can be indexed.
        labels_true, labels_pred = check_clusterings(labels_true, labels_pred)

        n_samples = len(labels_true)
        if n_samples < 2:
            # No pairs exist; avoid dividing by zero below.
            return 1.0

        # A pair counts as an agreement when both clusterings make the same
        # decision about it (both "same cluster" or both "different cluster").
        # Pairs are streamed from the iterator instead of being stored.
        agreements = 0
        for i, j in combinations(range(n_samples), 2):
            same_in_true = labels_true[i] == labels_true[j]
            same_in_pred = labels_pred[i] == labels_pred[j]
            if bool(same_in_true) == bool(same_in_pred):
                agreements += 1

        # n choose 2, written out so no external comb() helper is needed.
        n_pairs = n_samples * (n_samples - 1) / 2.0
        return agreements / n_pairs
    

    As a simple example:

    # Six samples give 15 unordered pairs; the call below reports the
    # fraction of those pairs on which the two labelings agree.
    labels_true = [1, 1, 0, 0, 0, 0]
    labels_pred = [0, 0, 0, 1, 0, 1]
    rand_score (labels_true, labels_pred)
    # Output reported by the author (7 agreeing pairs out of 15, i.e. 7/15):
    #0.46666666666666667
    

    There are probably ways to improve it and make it more Pythonic. If you have any suggestions, feel free to improve it.

    I found this implementation which seems faster.

    import numpy as np
    from scipy.misc import comb
    def rand_index_score(clusters, classes):
        """Return the Rand Index between two labelings of the same samples.

        The score is ``(tp + tn) / (tp + fp + fn + tn)`` over all unordered
        sample pairs, where a pair is a true positive when both labelings put
        it together and a true negative when both keep it apart. The counts
        come from group sizes, so no pair is enumerated explicitly.

        Parameters
        ----------
        clusters : array-like, 1D
            Cluster label of each sample. Any labels ``np.unique`` can sort
            are accepted (negative integers, strings, ...).
        classes : array-like, 1D
            Class label of each sample; same length as ``clusters``.

        Returns
        -------
        float
            Score in [0, 1]. Inputs with fewer than two samples contain no
            pairs and score 1.0.

        Raises
        ------
        ValueError
            If either input is not 1D or the two inputs differ in length.
        """
        clusters = np.asarray(clusters)
        classes = np.asarray(classes)
        if clusters.ndim != 1 or classes.ndim != 1:
            raise ValueError("clusters and classes must both be 1D")
        if clusters.shape != classes.shape:
            raise ValueError("clusters and classes must have the same length")

        n_samples = clusters.shape[0]
        if n_samples < 2:
            # No pairs exist; avoid dividing by zero below.
            return 1.0

        def pair_count(sizes):
            # Number of unordered pairs inside groups of the given sizes.
            sizes = sizes.astype(np.int64)
            return int((sizes * (sizes - 1) // 2).sum())

        # Map arbitrary labels to dense 0..k-1 codes and get the group sizes.
        _, cluster_ids, cluster_sizes = np.unique(
            clusters, return_inverse=True, return_counts=True)
        _, class_ids, class_sizes = np.unique(
            classes, return_inverse=True, return_counts=True)

        # Give every (cluster, class) combination a unique code; counting the
        # distinct codes yields the non-empty contingency cell sizes.
        joint_codes = cluster_ids.astype(np.int64) * len(class_sizes) + class_ids
        _, cell_sizes = np.unique(joint_codes, return_counts=True)

        tp = pair_count(cell_sizes)
        fp = pair_count(cluster_sizes) - tp  # together in clusters only
        fn = pair_count(class_sizes) - tp    # together in classes only
        total = n_samples * (n_samples - 1) // 2
        tn = total - tp - fp - fn
        return (tp + tn) / float(total)
    

    As a simple example:

    # Same six-sample example as before, scored with rand_index_score.
    # The labels are passed as (clusters, classes); swapping the two
    # arguments only exchanges fp and fn, so the score is unchanged.
    labels_true = [1, 1, 0, 0, 0, 0]
    labels_pred = [0, 0, 0, 1, 0, 1]
    rand_index_score (labels_true, labels_pred)
    # Output reported by the author (7/15):
    #0.46666666666666667