pythonpymupdf

Identify duplicates files in folder and subfolder


I am new and still learning Python.

I am trying to identify unique files from files in my main folder, i.e. xx and files in the subfolder, i.e. yy. The duplicated files can have different file names.

i.e. E:/username/Desktop/xx/filename.pdf
     E:/username/Desktop/xx/filename1.pdf
     E:/username/Desktop/xx/yy/filename1_23.pdf

The duplicated files are filename1.pdf and file_name1_23.pdf. My expected output is a list of unique files. If there are duplicated file, take the first file.

Expected output

i.e.['E:/username/Desktop/xx/filename.pdf', 
     'E:/username/Desktop/xx/filename1.pdf']

The issue is the codes are able to run but unable to identify unique files. Could you please help me to check my code logic problem?

Thank you so much for your time.

import os 
import fitz

path = 'E:/username/Desktop/xx'
pdf_paths = [os.path.join(root, name)
             for root, dirs, files in os.walk(path)
             for name in files
             if name.endswith('.pdf')]

def extract_text_from_pdf(pdf_path):

    text = ''
    pdf_document = fitz.open(pdf_path)

    for page_num in range(pdf_document.page_count):
        page = pdf_document.load_page(page_num)
        text += page.get_text()

    pdf_document.close()

    return text

def compare_pdfs(pdf_path1, pdf_path2):

    text1 = extract_text_from_pdf(pdf_path1)
    text2 = extract_text_from_pdf(pdf_path2)

    if text1==text2:
        return True
    else:
        return False

def compare_multiple_pdfs(pdf_paths):

    unique_files = []

    for i in range(len(pdf_paths)):
        for j in range(i+1, len(pdf_paths)):

            pdf_path1 = pdf_paths[i]
            pdf_path2 = pdf_paths[j]

        if compare_pdfs(pdf_path1, pdf_path2):
            for existing_file in unique_files:
                if compare_pdfs(pdf_path1, existing_file):
                    break
                else: 
                    unique_files.append(pdf_path1)

        else: 
            for existing_file in unique_files:
                if compare_pdfs(pdf_path1, existing_file) and compare_pdfs(pdf_path2, existing_file):
                    break
                elif compare_pdfs(pdf_path1, existing_file):
                    unique_files.append(pdf_path2)
                elif compare_pdfs(pdf_path2, existing_file):
                    unique_files.append(pdf_path1)
                else:
                    unique_files.append(pdf_path1)
                    unique_files.append(pdf_path2)

        return unique_files

    
compare_multiple_pdfs(pdf_paths)

Solution

  • You can calculate a hash for each file and compare them. For practical purposes a hash is unique for each input, effectively working as a unique identifier in this case.

    Something like this:

    import os
    import hashlib
    from collections import defaultdict
    
    
    def file_hash(filepath):
        hasher = hashlib.sha256()
        with open(filepath, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hasher.update(chunk)
        return hasher.hexdigest()
    
    
    def find_duplicate_files(directory):
        duplicates = defaultdict(list)
        for root, _, files in os.walk(directory):
            for file in files:
                if file.lower().endswith(".pdf"):
                    file_path = os.path.join(root, file)
                    file_checksum = file_hash(file_path)
                    duplicates[file_checksum].append(file_path)
        return {k: v for k, v in duplicates.items() if len(v) > 1}
    
    
    directory = ""  # <= add the path to your files here
    duplicate_files = find_duplicate_files(directory)
    
    if duplicate_files:
        print("duplicate files found:")
        for hash_value, files in duplicate_files.items():
            print(f"hash: {hash_value}")
            for file_path in files:
                print(f"- {file_path}")
    else:
        print("no duplicate files found")
    

    If duplicates are found this will print:

    duplicate files found:
    hash: e745ece17e34fd40e1f202ce8956cba607b60fa0095a1b2747f4b8e3611f75e5
    - directory/file1.pdf
    - directory/file2.pdf