I'm attempting to find the "best hits" in a similarity matrix: an m×n matrix where the value at (i, j) is the similarity between position i of one vector and position j of the other. The simplest way to explain it is finding the indices of the highest values in the matrix iteratively, excluding the rows and columns of previously selected entries. This yields min(m, n) index pairs.
Here's a minimal reproducible example of my current implementation, using pandas:
import numpy as np
import pandas as pd
def pairwise_best_hit(sim):
    # Tabulate (similarity, column index, row index) for every cell,
    # sorted by similarity in descending order.
    xdim, ydim = np.meshgrid(np.arange(sim.shape[1]), np.arange(sim.shape[0]))
    table = np.vstack((sim.ravel(), xdim.ravel(), ydim.ravel())).T
    df = pd.DataFrame(table).rename(columns={0: 'sim', 1: 'index2', 2: 'index1'}).sort_values('sim', ascending=False)
    seq1_hits = []
    seq2_hits = []
    while len(df):
        # Take the best remaining hit, then drop everything in its row and column.
        index1 = df.iloc[0]['index1']
        index2 = df.iloc[0]['index2']
        seq1_hits.append(index1)
        seq2_hits.append(index2)
        df = df[(df['index1'] != index1) & (df['index2'] != index2)]
    return [seq1_hits, seq2_hits]
and for a matrix
sim = np.array([[1,5,6,2],[7,10,3,4],[1,5,3,7]])
pairwise_best_hit(sim)
returns
[[1, 2, 0], [1, 3, 2]]
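To make the selection order concrete, here is a short trace of the greedy rule on that matrix (an illustration added here, not part of the original code): it picks 10 at (1, 1), then 7 at (2, 3), then 6 at (0, 2), which is the output above read pairwise across the two lists.

sim = np.array([[1, 5, 6, 2], [7, 10, 3, 4], [1, 5, 3, 7]])
used_rows, used_cols = set(), set()
# Visit cells from highest to lowest value, skipping any cell whose
# row or column has already been taken.
for flat in np.argsort(sim, axis=None)[::-1]:
    i, j = divmod(int(flat), sim.shape[1])
    if i in used_rows or j in used_cols:
        continue
    used_rows.add(i)
    used_cols.add(j)
    print(i, j, sim[i, j])  # 1 1 10, then 2 3 7, then 0 2 6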
Figured an edit would be the best way to respond to all the comments at once. Re: typical data size – m and n are anywhere from 250 to 1000, and the values in the matrix are floats.
Now, for the results on a matrix of my actual data, which is about 300x350. Slightly tweaking the answers from LastDuckStanding, Julien, and Axel Donath, we have:
def original(sim):
    xdim, ydim = np.meshgrid(np.arange(sim.shape[1]), np.arange(sim.shape[0]))
    table = np.vstack((sim.ravel(), xdim.ravel(), ydim.ravel())).T
    df = pd.DataFrame(table).rename(columns={0: 'sim', 1: 'index2', 2: 'index1'}).sort_values('sim', ascending=False)
    output_list = []
    while len(df):
        index1 = df.iloc[0]['index1']
        index2 = df.iloc[0]['index2']
        score = df.iloc[0]['sim']
        output_list.append((int(index1), int(index2), score))
        df = df[(df['index1'] != index1) & (df['index2'] != index2)]
    return output_list
def lastduckstanding(input_matrix):
    mat = input_matrix.copy()
    # Flat indices of every cell, sorted by value (ascending).
    idxs = np.argsort(mat, None)
    output_list = []
    hit_is_set = set()
    hit_js_set = set()
    num_entries = min(mat.shape[0], mat.shape[1])
    # Walk from highest to lowest, skipping used rows and columns.
    for idx in reversed(idxs):
        i, j = divmod(idx, mat.shape[1])
        if i in hit_is_set or j in hit_js_set:
            continue
        hit_is_set.add(i)
        hit_js_set.add(j)
        output_list.append((i, j, mat[i, j]))
        if len(output_list) == num_entries:
            break
    return output_list
def julien(matrix: np.ndarray):
    out = []
    copy = matrix.copy()
    for _ in range(min(copy.shape)):
        # Best remaining hit, then mask out its row and column.
        ij = np.unravel_index(copy.argmax(), copy.shape)
        indices_plus_score = (ij[0], ij[1], copy[ij[0], ij[1]])
        out.append(indices_plus_score)
        copy[ij[0], :] = -np.inf
        copy[:, ij[1]] = -np.inf
    return out
def axeldonath(arr, indices):
    """Find the maximum value in a 2D array recursively."""
    # Stop once everything has been masked to -inf (all rows or columns used).
    if not np.any(np.isfinite(arr)):
        return indices
    idx_max = np.argmax(arr)
    idxs = np.unravel_index(idx_max, arr.shape)
    indices_plus_score = (idxs[0], idxs[1], arr[idxs[0], idxs[1]])
    arr[idxs[0], :] = -np.inf
    arr[:, idxs[1]] = -np.inf
    indices.append(indices_plus_score)
    return axeldonath(arr, indices)

def axeldonath_wrapper(similarity):
    testsim = similarity.copy()
    return axeldonath(testsim, [])
def timing_test(S, functionlist):
    # %timeit is an IPython magic, so this is meant to be run in a
    # notebook / IPython session.
    for function in functionlist:
        testsim = S['matrix']
        print(function)
        %timeit function(testsim)
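For reference, a hypothetical invocation of the harness (the construction of S isn't shown here; only its 'matrix' key is used above):

S = {'matrix': np.random.rand(300, 350)}  # assumed stand-in for the real data
timing_test(S, [original, lastduckstanding, julien, axeldonath_wrapper])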
With the following timing results:
<function original at 0x7f405006d1c0>
287 ms ± 1.85 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
<function lastduckstanding at 0x7f405006d120>
30 ms ± 2.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
<function julien at 0x7f405006d260>
7.9 ms ± 30.2 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
<function axeldonath_wrapper at 0x7f405006d3a0>
16.9 ms ± 42.9 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
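As a quick sanity check (added here for illustration, not part of the original benchmarks), all four variants agree on the example matrix once it is cast to float, which the -np.inf masking requires:

sim_check = np.array([[1, 5, 6, 2], [7, 10, 3, 4], [1, 5, 3, 7]], dtype=float)
expected = [(1, 1, 10.0), (2, 3, 7.0), (0, 2, 6.0)]
for fn in (original, lastduckstanding, julien, axeldonath_wrapper):
    # Normalize numpy scalars so the tuples compare cleanly.
    got = [(int(i), int(j), float(s)) for i, j, s in fn(sim_check)]
    assert got == expected, fn.__name__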
Here's a candidate using numpy only, about 100x faster on your small example:
import numpy as np
import pandas as pd
import timeit
sim = np.array([[1,5,6,2],[7,10,3,4],[1,5,3,7]])
def mine(sim):
    out = []
    copy = sim.copy()
    MIN = np.iinfo(copy.dtype).min  # or -np.inf if using floats...
    for _ in range(min(copy.shape)):
        ij = np.unravel_index(copy.argmax(), copy.shape)
        out.append(ij)
        copy[ij[0]] = MIN
        copy[:, ij[1]] = MIN
    return np.transpose(out)
def yours(sim):
    xdim, ydim = np.meshgrid(np.arange(sim.shape[1]), np.arange(sim.shape[0]))
    table = np.vstack((sim.ravel(), xdim.ravel(), ydim.ravel())).T
    df = pd.DataFrame(table).rename(columns={0: 'sim', 1: 'index2', 2: 'index1'}).sort_values('sim', ascending=False)
    seq1_hits = []
    seq2_hits = []
    while len(df):
        index1 = df.iloc[0]['index1']
        index2 = df.iloc[0]['index2']
        seq1_hits.append(index1)
        seq2_hits.append(index2)
        df = df[(df['index1'] != index1) & (df['index2'] != index2)]
    return [seq1_hits, seq2_hits]
assert np.all(mine(sim) == yours(sim))
%timeit yours(sim)
# 1.05 ms ± 6.78 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
%timeit mine(sim)
# 8.18 µs ± 19.4 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
The comparison slowly degrades for larger arrays, but mine stays ahead (still 10x faster on 1000x1000 arrays):
sim = np.arange(10000)
np.random.shuffle(sim)
sim.shape = 100,100
%timeit yours(sim)
# 26.2 ms ± 64.6 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit mine(sim)
# 397 µs ± 534 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
sim = np.arange(1000000)
np.random.shuffle(sim)
sim.shape = 1000,1000
%timeit yours(sim)
# 2.45 s ± 18.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit mine(sim)
# 203 ms ± 2.17 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
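One caveat given the edit above: the real matrices hold floats, and np.iinfo is only defined for integer dtypes, so MIN would need to be -np.inf there, as the inline comment in mine already hints. A float-safe sketch of the same loop:

def mine_float(sim):
    out = []
    copy = sim.astype(float)  # astype returns a copy by default
    for _ in range(min(copy.shape)):
        ij = np.unravel_index(copy.argmax(), copy.shape)
        out.append(ij)
        # -np.inf is a valid "already used" sentinel for any float dtype.
        copy[ij[0]] = -np.inf
        copy[:, ij[1]] = -np.inf
    return np.transpose(out)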