I am new and still learning Python.
I am trying to identify unique files among the files in my main folder (xx) and the files in its subfolder (yy). Duplicated files can have different file names.
i.e. E:/username/Desktop/xx/filename.pdf
E:/username/Desktop/xx/filename1.pdf
E:/username/Desktop/xx/yy/filename1_23.pdf
The duplicated files are filename1.pdf and filename1_23.pdf. My expected output is a list of unique files. If there are duplicated files, keep the first one.
Expected output
i.e.['E:/username/Desktop/xx/filename.pdf',
'E:/username/Desktop/xx/filename1.pdf']
The issue is that the code runs without errors but fails to identify the unique files. Could you please help me find the logic problem in my code?
Thank you so much for your time.
import os
import fitz
# Root folder to scan; os.walk also descends into every subfolder.
path = 'E:/username/Desktop/xx'

# Collect the full path of every PDF found under `path`.
pdf_paths = []
for root, dirs, files in os.walk(path):
    for name in files:
        if name.endswith('.pdf'):
            pdf_paths.append(os.path.join(root, name))
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*."""
    pdf_document = fitz.open(pdf_path)
    try:
        # join once at the end instead of repeated += string concatenation
        return "".join(
            pdf_document.load_page(page_num).get_text()
            for page_num in range(pdf_document.page_count)
        )
    finally:
        # close the document even if text extraction raises
        pdf_document.close()
def compare_pdfs(pdf_path1, pdf_path2):
    """Return True when the two PDFs have identical extracted text."""
    # The comparison already yields a bool; no if/else needed.
    return extract_text_from_pdf(pdf_path1) == extract_text_from_pdf(pdf_path2)
def compare_multiple_pdfs(pdf_paths):
    """Return the paths of content-unique PDFs, keeping the first occurrence.

    Two PDFs count as duplicates when their extracted text is identical.
    The original pairwise loop re-extracted every file's text O(n^2) times
    and appended paths from both sides of each comparison, so duplicates
    (and repeats) ended up in the result; extracting each file's text once
    and checking it against the texts already kept fixes both problems.
    """
    unique_files = []  # paths whose content has not been seen before
    seen_texts = []    # extracted text of each entry in unique_files
    for pdf_path in pdf_paths:
        text = extract_text_from_pdf(pdf_path)
        # Linear scan is fine for small collections; hash the text if this grows.
        if text not in seen_texts:
            seen_texts.append(text)
            unique_files.append(pdf_path)
    return unique_files
# Keep and show the result — the original call discarded the returned list.
unique_files = compare_multiple_pdfs(pdf_paths)
print(unique_files)
You can calculate a hash for each file and compare them. For practical purposes, a hash is unique for each input, so it effectively works as a unique identifier in this case.
Something like this:
import os
import hashlib
from collections import defaultdict
def file_hash(filepath, chunk_size=65536):
    """Return the SHA-256 hex digest of the file at *filepath*.

    The file is read in chunks of *chunk_size* bytes so arbitrarily large
    files never have to be loaded into memory at once. The chunk size only
    affects memory use, never the resulting digest.
    """
    hasher = hashlib.sha256()
    with open(filepath, "rb") as f:
        # iter(callable, sentinel) stops when read() returns b"" at EOF.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()
def find_duplicate_files(directory):
    """Group PDFs under *directory* by content hash and return only the
    groups holding more than one file (i.e. the actual duplicates)."""
    by_checksum = defaultdict(list)
    for root, _, names in os.walk(directory):
        # case-insensitive extension match: .pdf, .PDF, ...
        pdf_names = (n for n in names if n.lower().endswith(".pdf"))
        for name in pdf_names:
            full_path = os.path.join(root, name)
            by_checksum[file_hash(full_path)].append(full_path)
    # keep only hashes shared by at least two files
    return {checksum: paths
            for checksum, paths in by_checksum.items()
            if len(paths) > 1}
directory = ""  # <= add the path to your files here

# Report every group of duplicates, or say there are none.
duplicate_files = find_duplicate_files(directory)
if not duplicate_files:
    print("no duplicate files found")
else:
    print("duplicate files found:")
    for hash_value, files in duplicate_files.items():
        print(f"hash: {hash_value}")
        for file_path in files:
            print(f"- {file_path}")
If duplicates are found this will print:
duplicate files found:
hash: e745ece17e34fd40e1f202ce8956cba607b60fa0095a1b2747f4b8e3611f75e5
- directory/file1.pdf
- directory/file2.pdf