I have a big list, `jsn`, containing many string elements, some of which may be duplicates or near-duplicates. I need to compare each element with the others for similarity and collect the indices of the duplicates in a `dubs` list, so that I can then remove those items from `jsn`.
Because `jsn` is large, I decided to use threading to speed up the inner `for` loop and reduce the waiting time.
But Thread/Process is not working as I expected.
The code below, which uses Thread, gives no performance improvement, and the `dubs` list is still empty after all the threads have been joined.
I tried using `.join()`, without success: I still get an empty `dubs` list and no change in performance.
The main problem: the `dubs` list is still empty when the deletion of duplicates starts.
from threading import Thread
from multiprocessing import Process
from difflib import SequenceMatcher
# Search for duplicates of jsn[a] among the elements that follow it
def finddubs(jsn, dubs, a, threshold=0.4):
    """Append to ``dubs`` the index of every later element that duplicates ``jsn[a]``.

    An element ``jsn[b]`` (with ``b > a``) counts as a duplicate when it is
    equal to ``jsn[a]`` or when their ``SequenceMatcher`` similarity ratio is
    greater than ``threshold``.

    Args:
        jsn: Sequence of strings to scan.
        dubs: List that receives the indices of duplicates (mutated in place).
            The same index can be appended by several calls, so callers
            should de-duplicate before using it.
        a: Index of the reference element.
        threshold: Similarity ratio in the range [0, 1] above which two
            strings are treated as duplicates. ``ratio()`` never exceeds 1.0,
            so the previous literal ``40`` could never match; the default
            reads it as 40 %.
    """
    # Start at a + 1: comparing jsn[a] with itself would flag every element,
    # and each unordered pair only needs to be checked once.
    for b in range(a + 1, len(jsn)):
        if jsn[a] == jsn[b] or SequenceMatcher(None, jsn[a], jsn[b]).ratio() > threshold:
            dubs.append(b)  # record the index of the later duplicate
# Start one thread per element; each thread records the duplicates it finds
# in the shared dubs list.
threads = []
for a in range(len(jsn)):
    t = Thread(target=finddubs, args=(jsn, dubs, a))
    threads.append(t)
    t.start()
# Wait for every thread so dubs is complete before anything is deleted
for thr in threads:
    thr.join()
# Delete duplicate list items.
# The same index can be reported more than once, so collapse repeats, and
# delete from the highest index down: removing an element shifts every later
# element left, which would invalidate the larger indices still pending.
for k in sorted({int(d) for d in dubs}, reverse=True):
    del jsn[k]
Without threading, the code works.
You need to use `multiprocessing` instead of `threading` if you want to speed up your computations. Please read about the GIL (Global Interpreter Lock) for detailed information on the topic.
An example of how `multiprocessing` can be used for this task:
import multiprocessing
from difflib import SequenceMatcher
from uuid import uuid4
# Build a large sample list of random strings that contains a few known
# duplicates: "abc" at indices 0 and 1_001; "b" at indices 1_002 and 1_003.
jsn = ['abc', *(str(uuid4()) for _ in range(1_000)), 'abc', 'b', 'b']
def compare_strings(a: int, b: int, threshold: float = 0.4):
    """Return ``(a, b)`` if ``jsn[a]`` and ``jsn[b]`` are duplicates, else ``None``.

    Two strings count as duplicates when they are equal or when their
    ``SequenceMatcher`` similarity ratio is greater than ``threshold``.

    Args:
        a: Index of the first string in the module-level ``jsn`` list.
        b: Index of the second string in the module-level ``jsn`` list.
        threshold: Similarity ratio in the range [0, 1] above which the pair
            is reported. ``ratio()`` never exceeds 1.0, so the previous
            literal ``40`` could never match; the default reads it as 40 %.

    Returns:
        The tuple ``(a, b)`` for a duplicate pair, otherwise ``None``.
    """
    first, second = jsn[a], jsn[b]
    if first == second or SequenceMatcher(None, first, second).ratio() > threshold:
        return a, b
    return None
# Compare every unordered pair of indices using a pool of worker processes.
# The __main__ guard is required by multiprocessing: with the "spawn" and
# "forkserver" start methods each worker imports this module again, and an
# unguarded Pool would then be created recursively inside every worker.
# NOTE(review): under those start methods each worker also rebuilds `jsn`,
# so its random entries differ between processes; only the fixed entries
# compare consistently. Confirm the start method or pass the strings
# themselves to the workers if that matters.
if __name__ == "__main__":
    pairs = [(i, j) for i in range(len(jsn)) for j in range(i + 1, len(jsn))]
    with multiprocessing.Pool(processes=10) as pool:
        results = pool.starmap(compare_strings, pairs)
    for result in results:
        # compare_strings returns None for pairs that are not duplicates
        if result is not None:
            a, b = result
            print(f"Duplicated pair: {a} {b} {jsn[b]}")
    # delete duplicates
A modification of your code that should work:
from difflib import SequenceMatcher
from threading import Thread
from uuid import uuid4
# Build a sample list of random strings that contains a few known
# duplicates: "abc" at indices 0 and 101; "b" at indices 102 and 103.
jsn = ["abc", *(str(uuid4()) for _ in range(100)), "abc", "b", "b"]
# Receives the indices of the elements found to be duplicates
dubs = list()
# Search for duplicates of jsn[a] among the elements that follow it
def finddubs(jsn, dubs, a, threshold=0.4):
    """Append to ``dubs`` the index of every later element that duplicates ``jsn[a]``.

    An element ``jsn[b]`` (with ``b > a``) counts as a duplicate when it is
    equal to ``jsn[a]`` or when their ``SequenceMatcher`` similarity ratio is
    greater than ``threshold``. Each match is also printed as ``a b``.

    Args:
        jsn: Sequence of strings to scan.
        dubs: List that receives the indices of duplicates (mutated in place).
            The same index can be appended by several calls, so callers
            should de-duplicate before using it.
        a: Index of the reference element.
        threshold: Similarity ratio in the range [0, 1] above which two
            strings are treated as duplicates. ``ratio()`` never exceeds 1.0,
            so the previous literal ``40`` could never match; the default
            reads it as 40 %.
    """
    for b in range(a + 1, len(jsn)):
        if jsn[a] == jsn[b] or SequenceMatcher(None, jsn[a], jsn[b]).ratio() > threshold:
            print(a, b)
            dubs.append(b)  # record the index of the later duplicate
# Start one thread per element; each thread records the duplicates it finds
# in the shared dubs list.
threads = []
for a in range(len(jsn)):
    t = Thread(target=finddubs, args=(jsn, dubs, a))
    threads.append(t)
    t.start()
# Wait for every thread so dubs is complete before anything is deleted
for thr in threads:
    thr.join()
# Delete duplicate list items
print(dubs)
# The same index can be reported more than once, so collapse repeats, and
# delete from the highest index down: removing an element shifts every later
# element left, so ascending deletion removes the wrong items or raises
# IndexError once a pending index is past the end of the shortened list.
for k in sorted({int(d) for d in dubs}, reverse=True):
    del jsn[k]