I'm attempting to find the "best hits" in a similarity matrix: an m×n matrix where the value at (i, j) is the similarity between position i of one vector and position j of the other. The simplest way to explain it is finding the indices of the highest values in the matrix iteratively, excluding the rows and columns of previously selected entries. This yields min(m, n) index pairs.
Here's a minimal reproducible example of my current implementation, using pandas:
import numpy as np
import pandas as pd
def pairwise_best_hit(sim):
    # Tabulate (similarity, column index, row index) for every cell,
    # sorted by similarity in descending order.
    xdim, ydim = np.meshgrid(np.arange(sim.shape[1]), np.arange(sim.shape[0]))
    table = np.vstack((sim.ravel(), xdim.ravel(), ydim.ravel())).T
    df = pd.DataFrame(table).rename(columns={0: 'sim', 1: 'index2', 2: 'index1'}).sort_values('sim', ascending=False)
    seq1_hits = []
    seq2_hits = []
    while len(df):
        # Take the best remaining hit, then drop everything in its row and column.
        index1 = df.iloc[0]['index1']
        index2 = df.iloc[0]['index2']
        seq1_hits.append(index1)
        seq2_hits.append(index2)
        df = df[(df['index1'] != index1) & (df['index2'] != index2)]
    return [seq1_hits, seq2_hits]
and for a matrix
sim = np.array([[1,5,6,2],[7,10,3,4],[1,5,3,7]])
pairwise_best_hit(sim)
returns
[[1, 2, 0], [1, 3, 2]]
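To make the selection order concrete, here is a short trace of the greedy rule on that matrix (an illustration added here, not part of the original code): it picks 10 at (1, 1), then 7 at (2, 3), then 6 at (0, 2), which is the output above read pairwise across the two lists.

sim = np.array([[1, 5, 6, 2], [7, 10, 3, 4], [1, 5, 3, 7]])
used_rows, used_cols = set(), set()
# Visit cells from highest to lowest value, skipping any cell whose
# row or column has already been taken.
for flat in np.argsort(sim, axis=None)[::-1]:
    i, j = divmod(int(flat), sim.shape[1])
    if i in used_rows or j in used_cols:
        continue
    used_rows.add(i)
    used_cols.add(j)
    print(i, j, sim[i, j])  # 1 1 10, then 2 3 7, then 0 2 6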
Figured an edit would be the best way to respond to all the comments at once. Re: typical data size – m and n are anywhere from 250 to 1000, and the values in the matrix are floats.
Now, for the results on a matrix of my actual data, which is about 300x350. Slightly tweaking the answers from LastDuckStanding, Julien, and Axel Donath, we have:
def original(sim):
    xdim, ydim = np.meshgrid(np.arange(sim.shape[1]), np.arange(sim.shape[0]))
    table = np.vstack((sim.ravel(), xdim.ravel(), ydim.ravel())).T
    df = pd.DataFrame(table).rename(columns={0: 'sim', 1: 'index2', 2: 'index1'}).sort_values('sim', ascending=False)
    output_list = []
    while len(df):
        index1 = df.iloc[0]['index1']
        index2 = df.iloc[0]['index2']
        score = df.iloc[0]['sim']
        output_list.append((int(index1), int(index2), score))
        df = df[(df['index1'] != index1) & (df['index2'] != index2)]
    return output_list
def lastduckstanding(input_matrix):
    mat = input_matrix.copy()
    # Flat indices of every cell, sorted by value (ascending).
    idxs = np.argsort(mat, None)
    output_list = []
    hit_is_set = set()
    hit_js_set = set()
    num_entries = min(mat.shape[0], mat.shape[1])
    # Walk from highest to lowest, skipping used rows and columns.
    for idx in reversed(idxs):
        i, j = divmod(idx, mat.shape[1])
        if i in hit_is_set or j in hit_js_set:
            continue
        hit_is_set.add(i)
        hit_js_set.add(j)
        output_list.append((i, j, mat[i, j]))
        if len(output_list) == num_entries:
            break
    return output_list
def julien(matrix: np.ndarray):
    out = []
    copy = matrix.copy()
    for _ in range(min(copy.shape)):
        # Best remaining hit, then mask out its row and column.
        ij = np.unravel_index(copy.argmax(), copy.shape)
        indices_plus_score = (ij[0], ij[1], copy[ij[0], ij[1]])
        out.append(indices_plus_score)
        copy[ij[0], :] = -np.inf
        copy[:, ij[1]] = -np.inf
    return out
def axeldonath(arr, indices):
    """Find the maximum value in a 2D array recursively."""
    # Stop once everything has been masked to -inf (all rows or columns used).
    if not np.any(np.isfinite(arr)):
        return indices
    idx_max = np.argmax(arr)
    idxs = np.unravel_index(idx_max, arr.shape)
    indices_plus_score = (idxs[0], idxs[1], arr[idxs[0], idxs[1]])
    arr[idxs[0], :] = -np.inf
    arr[:, idxs[1]] = -np.inf
    indices.append(indices_plus_score)
    return axeldonath(arr, indices)

def axeldonath_wrapper(similarity):
    testsim = similarity.copy()
    return axeldonath(testsim, [])
def timing_test(S, functionlist):
    # %timeit is an IPython magic, so this is meant to be run in a
    # notebook / IPython session.
    for function in functionlist:
        testsim = S['matrix']
        print(function)
        %timeit function(testsim)
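For reference, a hypothetical invocation of the harness (the construction of S isn't shown here; only its 'matrix' key is used above):

S = {'matrix': np.random.rand(300, 350)}  # assumed stand-in for the real data
timing_test(S, [original, lastduckstanding, julien, axeldonath_wrapper])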
With the following timing results:
<function original at 0x7f405006d1c0>
287 ms ± 1.85 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
<function lastduckstanding at 0x7f405006d120>
30 ms ± 2.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
<function julien at 0x7f405006d260>
7.9 ms ± 30.2 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
<function axeldonath_wrapper at 0x7f405006d3a0>
16.9 ms ± 42.9 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
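As a quick sanity check (added here for illustration, not part of the original benchmarks), all four variants agree on the example matrix once it is cast to float, which the -np.inf masking requires:

sim_check = np.array([[1, 5, 6, 2], [7, 10, 3, 4], [1, 5, 3, 7]], dtype=float)
expected = [(1, 1, 10.0), (2, 3, 7.0), (0, 2, 6.0)]
for fn in (original, lastduckstanding, julien, axeldonath_wrapper):
    # Normalize numpy scalars so the tuples compare cleanly.
    got = [(int(i), int(j), float(s)) for i, j, s in fn(sim_check)]
    assert got == expected, fn.__name__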
Here's a candidate using numpy only, about 100x faster on your small example:
import numpy as np
import pandas as pd
import timeit
sim = np.array([[1,5,6,2],[7,10,3,4],[1,5,3,7]])
def mine(sim):
    out = []
    copy = sim.copy()
    MIN = np.iinfo(copy.dtype).min  # or -np.inf if using floats...
    for _ in range(min(copy.shape)):
        ij = np.unravel_index(copy.argmax(), copy.shape)
        out.append(ij)
        copy[ij[0]] = MIN
        copy[:, ij[1]] = MIN
    return np.transpose(out)
def yours(sim):
    xdim, ydim = np.meshgrid(np.arange(sim.shape[1]), np.arange(sim.shape[0]))
    table = np.vstack((sim.ravel(), xdim.ravel(), ydim.ravel())).T
    df = pd.DataFrame(table).rename(columns={0: 'sim', 1: 'index2', 2: 'index1'}).sort_values('sim', ascending=False)
    seq1_hits = []
    seq2_hits = []
    while len(df):
        index1 = df.iloc[0]['index1']
        index2 = df.iloc[0]['index2']
        seq1_hits.append(index1)
        seq2_hits.append(index2)
        df = df[(df['index1'] != index1) & (df['index2'] != index2)]
    return [seq1_hits, seq2_hits]
assert np.all(mine(sim) == yours(sim))
%timeit yours(sim)
# 1.05 ms ± 6.78 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
%timeit mine(sim)
# 8.18 µs ± 19.4 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
The comparison slowly degrades for larger arrays, but mine stays ahead (still 10x faster on 1000x1000 arrays):
sim = np.arange(10000)
np.random.shuffle(sim)
sim.shape = 100,100
%timeit yours(sim)
# 26.2 ms ± 64.6 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit mine(sim)
# 397 µs ± 534 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
sim = np.arange(1000000)
np.random.shuffle(sim)
sim.shape = 1000,1000
%timeit yours(sim)
# 2.45 s ± 18.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit mine(sim)
# 203 ms ± 2.17 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
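One caveat given the edit above: the real matrices hold floats, and np.iinfo is only defined for integer dtypes, so MIN would need to be -np.inf there, as the inline comment in mine already hints. A float-safe sketch of the same loop:

def mine_float(sim):
    out = []
    copy = sim.astype(float)  # astype returns a copy by default
    for _ in range(min(copy.shape)):
        ij = np.unravel_index(copy.argmax(), copy.shape)
        out.append(ij)
        # -np.inf is a valid "already used" sentinel for any float dtype.
        copy[ij[0]] = -np.inf
        copy[:, ij[1]] = -np.inf
    return np.transpose(out)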