Tags: python, hash, md5, hashlib

Why am I getting different hashes for duplicate files?


I need to check for duplicate and corrupted files in a folder that contains literally millions of files.

At first, I tried it this way:

import os
import hashlib

import pandas as pd

hash_files = pd.DataFrame({"Path": [],
                           "Hash": []})

rootdir = "//?/Z:/"

error_files = []
for root, subdir, files in os.walk(rootdir):
    print(root)
    for filename in files:

        # String with file path
        source = root + "/" + filename

        with open(source, "rb") as file_obj:

            # File reading
            file_contents = file_obj.read()

        # Hash identification
        md5_hash = hashlib.blake2b(file_contents).hexdigest()

        hash_files.loc[len(hash_files) + 1] = [source, md5_hash]

But the file-reading part was taking too long on large files (and there are a lot of them), so I tried another approach that seemed much quicker:

hash_files = pd.DataFrame({"Path":[],
                           "Hash":[]})

rootdir = "//?/Z:/"

error_files = []
for root, subdir, files in os.walk(rootdir):
    print(root)
    for filename in files:

        # String with file path
        source = root + "/" + filename  
            
        # Hash identification
        md5_hash = hashlib.blake2b(source.encode('utf-8')).hexdigest()
        
        hash_files.loc[len(hash_files) + 1] = [source, md5_hash]

But with this second approach, I'm getting different hashes for duplicate files, and I thought they had to be the same.

Does anyone know what is wrong, or how I can get the right hash for all these files more quickly?


Solution

  • The hashes differ because your second snippet never reads the files at all: it hashes the path string, and two copies of a file necessarily have different paths. You still have to hash the contents, and speeding that up is a good case for multiprocessing (a sketch for grouping the results into duplicate sets follows the code below).

    The subprocesses should calculate the hash and return the Path and hexdigest.

    Something like this will probably be as fast as you can possibly make it:

    from hashlib import blake2b
    from multiprocessing import Pool
    from pathlib import Path
    from typing import Iterator
    from time import perf_counter
    
    CHUNK = 64 * 1024
    SRCDIR = Path("/Users/CtrlZ")
    
    
    def process(path: Path) -> tuple[Path, str]:
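        # Hash the file contents in fixed-size chunks so huge files never have to fit in memory at once.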
        h = blake2b()
        with open(path, "rb") as f:
            while b := f.read(CHUNK):
                h.update(b)
        return path, h.hexdigest()
    
    
    def getfiles(d: Path) -> Iterator[Path]:
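        # Unlike the os.walk loop in the question, this yields only the files directly inside d (non-recursive).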
        for e in d.iterdir():
            if e.is_file():
                yield e
    
    
    def main():
        start = perf_counter()
        count = 0
        with Pool() as pool:
            result = pool.map_async(process, getfiles(SRCDIR))
            for p, h in result.get():
                print(p, h)
                count += 1
        duration = perf_counter() - start
        print(f"Processed {count} files in {duration:.2f} seconds")
    
    
    if __name__ == "__main__":
        main()
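
    Once you have the (path, hexdigest) pairs, spotting the duplicates is just a matter of grouping the paths by digest. Here is a minimal sketch of that step, under a few assumptions of mine: walkfiles() is a hypothetical recursive variant of getfiles() (closer to the os.walk loop in the question), find_duplicates() is just an illustrative name, and SRCDIR is whatever root you are scanning. process() is repeated from above so the sketch runs on its own.

    from collections import defaultdict
    from hashlib import blake2b
    from multiprocessing import Pool
    from pathlib import Path
    from typing import Iterator

    CHUNK = 64 * 1024
    SRCDIR = Path("/Users/CtrlZ")  # placeholder root, same as above


    def process(path: Path) -> tuple[Path, str]:
        # Same chunked hashing as above.
        h = blake2b()
        with open(path, "rb") as f:
            while b := f.read(CHUNK):
                h.update(b)
        return path, h.hexdigest()


    def walkfiles(d: Path) -> Iterator[Path]:
        # Hypothetical recursive variant of getfiles(), closer to the os.walk loop.
        for e in d.rglob("*"):
            if e.is_file():
                yield e


    def find_duplicates(d: Path) -> dict[str, list[Path]]:
        groups: dict[str, list[Path]] = defaultdict(list)
        with Pool() as pool:
            # imap_unordered streams results back as each worker finishes a file.
            for path, digest in pool.imap_unordered(process, walkfiles(d)):
                groups[digest].append(path)
        # Keep only digests shared by two or more files.
        return {h: paths for h, paths in groups.items() if len(paths) > 1}


    if __name__ == "__main__":
        for digest, paths in find_duplicates(SRCDIR).items():
            print(digest)
            for p in paths:
                print("   ", p)

    Because multiprocessing may start fresh interpreters (it always does on Windows), keep process() at module level and the driver code behind the if __name__ == "__main__" guard, exactly as in the code above.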