python memory unrar

Memory usage buildup while dictionary password checking using unrar lib in python


I wrote some crude Python code that checks passwords from a dictionary file against a password-protected RAR archive. I even added some multi-threading, and it runs great. Unfortunately, as the script goes through the password list, memory usage keeps growing: after more than 10k tries it exceeds 10 GB. I couldn't find any method in the unrar lib documentation for freeing resources, and calling gc.collect() didn't help. How can I free the buffer after every password check? Here's the code:

import os
import os.path
import fileinput
import sys
from unrar import rarfile
import gc
import threading
import linecache

class App():
    """Dictionary-attack worker for a password-protected RAR archive."""

    @staticmethod
    def check(fraction, n):
        """Try one contiguous slice of the dictionary against the archive.

        The archive path is taken from ``sys.argv[1]`` and the dictionary
        path from ``sys.argv[2]``.  The dictionary is split into ``n``
        slices of equal size; this call handles slice number ``fraction``
        (0-based).  The last slice additionally takes the lines left over
        when the line count is not a multiple of ``n``.

        The first password for which the archive opens and lists at least
        one member is printed and ends the search for this worker.

        Args:
            fraction: 0-based index of the slice handled by this call.
            n: total number of slices (one per worker thread).
        """
        FILE = sys.argv[1]
        DICT = sys.argv[2]

        # Count the lines without materialising a throwaway list.
        with open(DICT, 'r') as passdict:
            k = sum(1 for _ in passdict)

        counter = k // n
        # linecache numbers lines from 1, so the first slice starts at 1.
        start = counter * fraction + 1
        # `stop` is exclusive; the last worker also covers the remainder.
        stop = k + 1 if fraction == n - 1 else start + counter
        print('fr: %s start: %s stop: %s'% (fraction, start, stop))
        for i in range(start, stop):
            # getline() keeps the line terminator; it is not part of the
            # password and must be removed before trying it.
            p = linecache.getline(DICT, i).rstrip('\r\n')
            try:
                rf = rarfile.RarFile(FILE, pwd=p)
                if len(rf.namelist()) > 0:
                    print(p)
                    break
            except rarfile.BadRarFile:
                # Rejected attempt: collect the youngest generation and
                # move on to the next candidate.
                gc.collect(generation=0)
        return
        

if __name__ == '__main__':
    # Start six worker threads; each receives its own index and the total
    # number of workers as arguments to App.check.
    worker_count = 6
    for index in range(worker_count):
        threading.Thread(target=App.check, args=(index, worker_count)).start()

Edit: OK, so I switched to the rarfile lib (pypi.org/project/rarfile). Memory no longer builds up, but multi-threading stopped working and it also runs much slower. It looks like everything runs on a single thread (according to Task Manager) :/


Solution

  • I think I fixed it. Unfortunately, trying the newer library didn't help. Well, it helped with the memory issue, but somehow it wouldn't do multithreading, so it was really slow. I managed to fix the unrar library: I added a call to the _close function in the exception handler. It looks like it just didn't free the resources when exiting with an exception, such as on a bad password. Maybe it only does that with archives that have encrypted filenames (as in my case), but I didn't check. Here is the modified code in rarfile.py of the unrar library:

    def _read_header(self, handle):
        """Read current member header into a RarInfo object.

        Returns the RarInfo built from the header, or None when the end of
        the archive has been reached.

        Raises RuntimeError when the archive requires a password or the
        supplied password is rejected, and BadRarFile for any other unrar
        error.  On each of these error paths the handle is closed before the
        exception is raised, so a failed attempt does not leave it open.
        """
        header_data = unrarlib.RARHeaderDataEx()
        try:
            unrarlib.RARReadHeaderEx(handle, ctypes.byref(header_data))
            rarinfo = RarInfo(header=header_data)
        except unrarlib.ArchiveEnd:
            # Normal termination: no more members to read.
            return None
        # NOTE(review): the handle is closed here on every failure path.
        # Callers that also close the handle in their own cleanup must not
        # close it a second time after this method raises -- confirm.
        except unrarlib.MissingPassword:
            self._close(handle)
            raise RuntimeError("Archive is encrypted, password required")
        except unrarlib.BadPassword:
            self._close(handle)
            raise RuntimeError("Bad password for Archive")
        except unrarlib.UnrarException as e:
            self._close(handle)
            raise BadRarFile(str(e))

        return rarinfo
    

    And here is my modified script:

    import os
    import os.path
    import fileinput
    import sys
    import re
    from unrar import rarfile
    import threading
    import linecache
    
    class App():
        """Dictionary-attack worker for a password-protected RAR archive."""

        @staticmethod
        def check(fraction, n):
            """Try one contiguous slice of the dictionary against the archive.

            The archive path is taken from ``sys.argv[1]`` and the dictionary
            path from ``sys.argv[2]``.  The dictionary is split into ``n``
            slices of equal size; this call handles slice number ``fraction``
            (0-based).  The last slice additionally takes the lines left over
            when the line count is not a multiple of ``n``.

            For every candidate the archive is opened and extracted to
            ``D:\\``; the first password that yields a non-empty member list
            is printed and ends the search for this worker.

            Args:
                fraction: 0-based index of the slice handled by this call.
                n: total number of slices (one per worker thread).
            """
            FILE = sys.argv[1]
            DICT = sys.argv[2]

            # Count the lines without materialising a throwaway list.
            with open(DICT, 'r') as passdict:
                k = sum(1 for _ in passdict)

            counter = k // n
            # linecache numbers lines from 1, so the first slice starts at 1.
            start = counter * fraction + 1
            # `stop` is the last line of this slice (inclusive); slices do
            # not overlap and the last worker also covers the remainder.
            stop = k if fraction == n - 1 else start + counter - 1
            print('fr: %s start: %s stop: %s'% (fraction, start, stop))
            for i in range(start, stop + 1):
                # Keep the leading run of non-whitespace characters, which
                # drops the line terminator returned by getline().
                p = re.match(r'\S*', linecache.getline(DICT, i)).group()
                try:
                    with rarfile.RarFile(FILE, pwd=p) as rf:
                        rf.extractall(path='D:\\',pwd=p)
                        if len(rf.namelist())>0:
                            print(p)
                            break
                except Exception:
                    # Best effort: any failure counts as a rejected password.
                    # Unlike a bare ``except:``, this lets SystemExit and
                    # KeyboardInterrupt propagate.
                    continue
            print('stop')
            return
            
    
    if __name__ == '__main__':
        # Start eight worker threads; each receives its own index and the
        # total number of workers as arguments to App.check.
        worker_count = 8
        for index in range(worker_count):
            threading.Thread(target=App.check, args=(index, worker_count)).start()