python, multiprocessing, python-multithreading, sequencematcher

Why are Python Thread and Process not working?


I have a big list jsn which contains a lot of string elements with possible duplicate values. I need to check every element for similarity, collect the indices of duplicate items in a dubs list, and then remove those items from jsn.

Because of the size of the jsn list, I decided to use threading to speed up the execution of the second for loop and reduce the waiting time.

But Thread/Process is not working as I expected.

The code below with Thread inside shows no change in performance, and the dubs list is also empty after the threads' join() has finished.

I tried it without .join(), but I still got an empty dubs list and no change in performance.

The main problem -> the dubs list is empty before the duplicate deletion starts.

from threading import Thread
from multiprocessing import Process
from difflib import SequenceMatcher

# Search for duplicates in the list
def finddubs(jsn,dubs,a):
    for b in range(len(jsn)):
        if ((jsn[a] == jsn[b]) or (SequenceMatcher(None, jsn[a], jsn[b]).ratio() > 40)):
            dubs.append(b) # add duplicate element indices to the dubs list
 
# Start threading
threads = []
for a in range(len(jsn)):
    t = Thread(target=finddubs, args=(jsn,dubs,a))
    threads.append(t)
    t.start()
for thr in threads:
    thr.join()

# Delete duplicate list items 
for d in dubs:
    k = int(d)
    del jsn[k]

Without threading, the code works.


Solution

  • You need to use multiprocessing instead of threading if you want to speed up your computations. Please read about the GIL (Global Interpreter Lock) for detailed information on the topic; the small timing sketch below illustrates the effect.

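    A minimal timing sketch, with an arbitrary CPU-bound workload, of why threads do not speed up pure-Python computation: the GIL lets only one thread execute Python bytecode at a time, so four threads take roughly as long as calling the function four times in a row.

    import time
    from threading import Thread

    def busy_work(n: int) -> None:
        # pure-Python CPU-bound loop; the GIL allows only one thread
        # to execute Python bytecode at any moment
        total = 0
        for i in range(n):
            total += i * i

    N = 2_000_000

    # sequential baseline: run the work four times in a row
    start = time.perf_counter()
    for _ in range(4):
        busy_work(N)
    print(f"sequential: {time.perf_counter() - start:.2f}s")

    # threaded: four threads doing the same total amount of work
    start = time.perf_counter()
    threads = [Thread(target=busy_work, args=(N,)) for _ in range(4)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print(f"threaded:   {time.perf_counter() - start:.2f}s")

    On CPython the two timings come out roughly the same (the threaded run is often a bit slower due to switching overhead), which matches the "no change in performance" you observed.
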
    An example of how multiprocessing can be used for this task:

    import multiprocessing
    from difflib import SequenceMatcher
    from uuid import uuid4
    
    # Let's generate a large list with random data
    # where we have a few duplicates: "abc" at indices 0 and 1_001; "b" at indices 1_002 and 1_003
    jsn = ['abc'] + [str(uuid4()) for _ in range(1_000)] + ['abc', 'b', 'b']
    
    
    def compare_strings(a: int, b: int):
        # ratio() returns a float in [0, 1], so the threshold has to be fractional;
        # 0.9 is an example value (a literal 40 could never be exceeded)
        if (jsn[a] == jsn[b]) or (SequenceMatcher(None, jsn[a], jsn[b]).ratio() > 0.9):
            return a, b
    
    
    # now we are comparing all possible pairs using multiprocessing
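    # note: on Windows/macOS the default start method is "spawn", so the Pool
    # creation below should be wrapped in an `if __name__ == "__main__":` guard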
    with multiprocessing.Pool(processes=10) as pool:
        results = pool.starmap(compare_strings, [(i, j) for i in range(len(jsn)) for j in range(i + 1, len(jsn))])
    
    for result in results:
        if result is not None:
            a, b = result
            print(f"Duplicated pair: {a} {b} {jsn[b]}")
            # delete duplicates
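
    For completeness, a small sketch of how the (a, b) pairs collected above could be turned into actual deletions, assuming we keep the first occurrence a and drop the later index b: gather those indices and delete from the back of the list so earlier deletions do not shift later positions.

    # continue from the `results` list built by pool.starmap() above
    to_delete = {pair[1] for pair in results if pair is not None}
    for idx in sorted(to_delete, reverse=True):
        del jsn[idx]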
    

    A modification of your code that should work (it fills dubs correctly, though the threads will not make it faster because of the GIL):

    from difflib import SequenceMatcher
    from threading import Thread
    from uuid import uuid4
    
    # Let's generate a list with random data
    # where we have a few duplicates: "abc" at indices 0 and 101; "b" at indices 102 and 103
    jsn = ["abc"] + [str(uuid4()) for _ in range(100)] + ["abc", "b", "b"]
    dubs = []
    
    # Search for duplicates in the list
    def finddubs(jsn, dubs, a):
        for b in range(a + 1, len(jsn)):
            # ratio() is in [0, 1]; 0.9 is an example similarity threshold
            if (jsn[a] == jsn[b]) or (SequenceMatcher(None, jsn[a], jsn[b]).ratio() > 0.9):
                print(a, b)
                dubs.append(b)  # record the index of the duplicate element
    
    
    # Start threading
    threads = []
    for a in range(len(jsn)):
        t = Thread(target=finddubs, args=(jsn, dubs, a))
        threads.append(t)
        t.start()
    for thr in threads:
        thr.join()
    
    # Delete duplicate list items, highest index first so earlier
    # deletions do not shift the positions of the remaining ones
    print(dubs)
    for d in sorted(set(dubs), reverse=True):
        del jsn[d]
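
    As a quick sanity check, assuming the reverse-order deletion above, the surviving list should contain no exact duplicates:

    print(len(jsn))                   # expect 102 of the 104 generated elements to remain
    assert len(jsn) == len(set(jsn))  # the repeated "abc" and "b" are gone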