I need to check for duplicated and corrupted files in a folder with literally millions of files.
At first, I tried it this way:
import hashlib
import os

import pandas as pd

hash_files = pd.DataFrame({"Path": [], "Hash": []})
rootdir = "//?/Z:/"
error_files = []

for root, subdir, files in os.walk(rootdir):
    print(root)
    for filename in files:
        # String with file path
        source = root + "/" + filename
        with open(source, "rb") as file_obj:
            # File reading
            file_contents = file_obj.read()
        # Hash identification
        md5_hash = hashlib.blake2b(file_contents).hexdigest()
        hash_files.loc[len(hash_files) + 1] = [source, md5_hash]
But the file-reading part was taking too long for large files (and there are a lot of them), so I tried another way that seemed much quicker:
hash_files = pd.DataFrame({"Path": [], "Hash": []})
rootdir = "//?/Z:/"
error_files = []

for root, subdir, files in os.walk(rootdir):
    print(root)
    for filename in files:
        # String with file path
        source = root + "/" + filename
        # Hash identification
        md5_hash = hashlib.blake2b(source.encode("utf-8")).hexdigest()
        hash_files.loc[len(hash_files) + 1] = [source, md5_hash]
But with this last approach I'm getting different hashes for duplicated files, and I thought they had to be the same.
Does anyone know what is wrong, or how I can get the right hash for all these files more quickly?
This is a good case for multiprocessing.
The subprocesses should calculate the hash and return the Path and hexdigest.
Something like this will probably be as fast as you can possibly make it:
from hashlib import blake2b
from multiprocessing import Pool
from pathlib import Path
from time import perf_counter
from typing import Iterator

# Read files in 64 KiB chunks so large files never have to fit in memory
CHUNK = 64 * 1024
SRCDIR = Path("/Users/CtrlZ")


def process(path: Path) -> tuple[Path, str]:
    """Hash one file's contents and return (path, hexdigest)."""
    h = blake2b()
    with open(path, "rb") as f:
        # The walrus assignment loops until read() returns b"" at EOF
        while b := f.read(CHUNK):
            h.update(b)
    return path, h.hexdigest()


def getfiles(d: Path) -> Iterator[Path]:
    # Note: this only yields files directly inside d (no recursion)
    for e in d.iterdir():
        if e.is_file():
            yield e
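

# If you also need to descend into subdirectories, the way os.walk does in
# the question, a recursive variant could look like this (my addition, not
# part of the original answer) -- Path.rglob("*") walks the whole tree:
def getfiles_recursive(d: Path) -> Iterator[Path]:
    for e in d.rglob("*"):
        if e.is_file():
            yield e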


def main():
    start = perf_counter()
    count = 0
    with Pool() as pool:
        # Farm the hashing out to one worker process per CPU core
        result = pool.map_async(process, getfiles(SRCDIR))
        for p, h in result.get():
            print(p, h)
            count += 1
    duration = perf_counter() - start
    print(f"Processed {count} files in {duration:.2f} seconds")


if __name__ == "__main__":
    main()
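Once you have the (path, hexdigest) pairs, finding the duplicates is just a matter of grouping paths by hash. Here is a minimal sketch of that step, assuming it is fed the result of the pool above; the find_duplicates name is mine, not something from the original code:

from collections import defaultdict
from pathlib import Path
from typing import Iterable


def find_duplicates(results: Iterable[tuple[Path, str]]) -> dict[str, list[Path]]:
    # Group paths by digest; any group with more than one path is a set of duplicates
    by_hash: dict[str, list[Path]] = defaultdict(list)
    for path, digest in results:
        by_hash[digest].append(path)
    return {h: paths for h, paths in by_hash.items() if len(paths) > 1}

For example, inside main() you could pass result.get() to find_duplicates and print each group of identical files.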