I have a large corpus of documents that I want to merge if they have a notable n-gram overlap (in my case, I'm considering bigrams). Consider the list of sets:
corpus = [{'example', 'bigram'}, {'another', 'example'}, {'some', 'outlier'}]
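To make the setup concrete, each such set can be thought of as coming from adjacent word pairs in a document. A simplified sketch (the to_bigram_sets helper and the plain whitespace tokenization are just for illustration, not my actual preprocessing):
# Illustrative only: a hypothetical helper that turns one document into
# frozensets of adjacent word pairs, assuming plain whitespace tokenization.
def to_bigram_sets(document):
    tokens = document.split()
    return [frozenset(pair) for pair in zip(tokens, tokens[1:])]

print(to_bigram_sets('another example bigram'))
# e.g. [frozenset({'another', 'example'}), frozenset({'example', 'bigram'})] (element order may vary)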
I have the following code to compute the similarity matrix:
import numpy as np

sim_matrix = np.zeros((len(corpus), len(corpus)))
for i in range(len(corpus)):
    for j in range(i+1, len(corpus)):
        sim_matrix[i][j] = get_ngram_overlap(corpus[i], corpus[j])
np.fill_diagonal(sim_matrix, 1)
where the get_ngram_overlap function is defined as follows:
def get_ngram_overlap(ngrams_s1, ngrams_s2):
    if min(len(ngrams_s1), len(ngrams_s2)) == 0:
        return 0
    common_ngrams = ngrams_s1 & ngrams_s2
    return len(common_ngrams) / min(len(ngrams_s1), len(ngrams_s2))
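For example, the first two sets in the corpus share one of their two elements, so their similarity is 0.5:
print(get_ngram_overlap({'example', 'bigram'}, {'another', 'example'}))  # 1 shared word / min size 2 = 0.5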
The result is an MxM matrix, where M is the number of n-gram sets in my corpus:
print(sim_matrix)
>> [[1. 0.5 0.]
[0. 1. 0.]
[0. 0. 1.]]
The problem is that when M grows large, the code becomes very slow and computationally expensive, because every unique pair has to be compared. Is there a way to compute this more efficiently, for example with an optimized distance-matrix routine that accepts a custom similarity metric working on sets of strings? Any help is appreciated!
You can make the computation faster by first building an inverted index and running get_ngram_overlap() only on the combinations that share at least one word; pairs with no word in common simply stay at a similarity of 0 (see the illustration after the output below):
import numpy as np
from itertools import combinations

# make the items in corpus frozensets (to make them hashable)
corpus = [frozenset({'example', 'bigram'}), frozenset({'another', 'example'}), frozenset({'some', 'outlier'})]

def get_ngram_overlap(ngrams_s1, ngrams_s2):
    mn = min(len(ngrams_s1), len(ngrams_s2))
    if mn == 0:
        return 0
    common_ngrams = ngrams_s1 & ngrams_s2
    return len(common_ngrams) / mn

# index: word -> all sets that contain this word
# nums:  set  -> all positions of this set in the corpus
index, nums = {}, {}
for i, b in enumerate(corpus):
    for word in b:
        index.setdefault(word, []).append(b)
    nums.setdefault(b, []).append(i)

# score only pairs of sets that share at least one word
index2 = {}
for k, v in index.items():
    for c in combinations(v, 2):
        index2[c] = get_ngram_overlap(*c)

# write the scores into the upper triangle of the matrix
sim_matrix = np.zeros((len(corpus), len(corpus)))
for a, b in index2:
    for x in nums[a]:
        for y in nums[b]:
            sim_matrix[(x, y) if x < y else (y, x)] = index2[(a, b)]
np.fill_diagonal(sim_matrix, 1)

print(sim_matrix)
Prints:
[[1. 0.5 0. ]
[0. 1. 0. ]
[0. 0. 1. ]]
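To make the index idea concrete, this is what the two lookup structures contain for the example corpus (ordering inside the frozensets and dictionaries may differ on your run). Only the two sets that share 'example' ever reach get_ngram_overlap; the outlier set is never compared:
print(index)
# {'example': [frozenset({'example', 'bigram'}), frozenset({'another', 'example'})],
#  'bigram':  [frozenset({'example', 'bigram'})],
#  'another': [frozenset({'another', 'example'})],
#  'some':    [frozenset({'some', 'outlier'})],
#  'outlier': [frozenset({'some', 'outlier'})]}
print(nums)
# {frozenset({'example', 'bigram'}): [0],
#  frozenset({'another', 'example'}): [1],
#  frozenset({'some', 'outlier'}): [2]}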
A benchmark with 10k bigrams drawn from 500 different words:
import random
import numpy as np
from timeit import timeit
from itertools import combinations

random.seed(123)
corpus = [frozenset({f'word{random.randint(1, 500)}', f'word{random.randint(1, 500)}'}) for _ in range(10_000)]

def get_ngram_overlap(ngrams_s1, ngrams_s2):
    mn = min(len(ngrams_s1), len(ngrams_s2))
    if mn == 0:
        return 0
    common_ngrams = ngrams_s1 & ngrams_s2
    return len(common_ngrams) / mn

def fn1():
    # naive version: compare all unique pairs
    sim_matrix = np.zeros((len(corpus), len(corpus)))
    for i in range(len(corpus)):
        for j in range(i+1, len(corpus)):
            sim_matrix[i][j] = get_ngram_overlap(corpus[i], corpus[j])
    np.fill_diagonal(sim_matrix, 1)
    return sim_matrix

def fn2():
    # index-based version: compare only pairs that share a word
    index, nums = {}, {}
    for i, b in enumerate(corpus):
        for word in b:
            index.setdefault(word, []).append(b)
        nums.setdefault(b, []).append(i)
    index2 = {}
    for k, v in index.items():
        for c in combinations(v, 2):
            index2[c] = get_ngram_overlap(*c)
    sim_matrix = np.zeros((len(corpus), len(corpus)))
    for a, b in index2:
        for x in nums[a]:
            for y in nums[b]:
                sim_matrix[(x, y) if x < y else (y, x)] = index2[(a, b)]
    np.fill_diagonal(sim_matrix, 1)
    return sim_matrix

assert np.array_equal(fn1(), fn2())

t1 = timeit(fn1, number=1)
t2 = timeit(fn2, number=1)

print(t1)
print(t2)
Prints on my machine (Python 3.10/AMD Ryzen 5700x):
19.253960204077885
0.37714757514186203
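As a possible follow-up for the merging step mentioned in the question, one option is to threshold the similarity matrix and merge the connected components of the resulting graph. This is only a sketch; the 0.5 threshold and the use of scipy.sparse.csgraph are assumptions, not part of the code above:
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components

# Sketch: treat pairs with similarity >= threshold as edges of a graph
# and merge each connected component into one set (threshold is hypothetical).
threshold = 0.5
adjacency = csr_matrix(sim_matrix >= threshold)
n_groups, labels = connected_components(adjacency, directed=False)

merged = {}
for idx, label in enumerate(labels):
    merged.setdefault(label, set()).update(corpus[idx])
print(list(merged.values()))
# for the 3-set example: [{'another', 'example', 'bigram'}, {'some', 'outlier'}] (order may vary)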