python loops automation directory pdf-parsing

Iterate Over Files (PDFs) to Run a Function

I am trying to read PDF files from a directory (path) to extract individual images from each PDF and write to the same directory. However, I am unable to perform the following function on each file as my script only parses the last file in the directory. The code I am using is shown below:

pip install pymupdf

import os
import PyPDF2
import fitz  # from pymupdf
import glob


# Directory that is searched for PDFs and that also receives the extracted images.
path = "C:\\Users\\mdl518\\Desktop\\"

def pdf_extract():
    """Write the images embedded in every ``*.pdf`` directly inside ``path`` to ``path``.

    Reads the module-level ``path``; takes no arguments and returns nothing.
    Output files are named ``page<page>-<xref>.jpg``.
    """
    # NOTE(review): recursive=True only affects patterns containing "**",
    # so this matches PDFs directly inside `path` only.
    for filename in glob.glob(os.path.join(path, "*.pdf"), recursive=True):  # file path specifying the location of the PDF files
        # NOTE(review): `f` is never used below; fitz.open() is given the file name itself.
        with open(os.path.join(os.getcwd(), filename),'rb') as f: # open/read the PDF files
            pdf_document=fitz.open(filename)
            for current_page in range(len(pdf_document)): # iterate over the total number of pages in each PDF
                for image in pdf_document.getPageImageList(current_page):
                    xref=image[0]  # first item of the image entry; passed to fitz.Pixmap to load that image
                    pix=fitz.Pixmap(pdf_document, xref)
                    # presumably pix.n is the number of components per pixel and
                    # values >= 5 need the RGB conversion below - confirm in the PyMuPDF docs
                    if pix.n < 5:  # capture all images and write to the file path
                        # NOTE(review): the name depends only on page number and xref,
                        # not on `filename`, so different PDFs can produce the same output path.
                        pix.writeImage(os.path.join(path,"page%s-%s.jpg") % (current_page, xref))
                    else:
                        pix1 = fitz.Pixmap(fitz.csRGB, pix)  # build an RGB pixmap from the original
                        pix1.writeImage(os.path.join(path,"page%s-%s.jpg") % (current_page, xref))
                        pix1 = None  # drop the reference to the converted pixmap
                    pix = None  # drop the reference before the next image

# Run the extraction when the script is executed.
pdf_extract()

I have tried to use glob, os.listdir(), and os.walk() to parse the individual PDFs but the best I have gotten is just pulling the images from the last PDF file to be read/written to the file path. Is there any easier way to go about this, or is it just a small tweak to my "glob" statement? Any assistance is most appreciated!

Solution

There are two issues

`with open(os.path.join(os.getcwd(), filename),'rb') as f: # open/read the PDF files` is not needed: `f` is never used, because `fitz.open(filename)` opens the PDF by itself.
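The main issue is that you're overwriting the images: every PDF writes to the same output file names, so only the images from the last PDF processed are left on disk.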
- pix.writeImage(os.path.join(path,"page%s-%s.jpg") % (current_page, xref))
- pix1.writeImage(os.path.join(path,"page%s-%s.jpg") % (current_page, xref))
- current_page and xref are not unique across files, so two different PDFs can produce the same output file name

Updates

The code is updated with pathlib, part of the standard library, because it treats paths as objects with methods, unlike glob and os, which treat paths as strings. Also see Python 3's pathlib Module: Taming the File System
Added _{file.stem} to the save path, to create a unique file name
Use f-strings for string formatting. Also see PEP 498 - Literal String Interpolation

from pathlib import Path
import PyPDF2
import fitz

def pdf_extract(path_to_files: str) -> None:
    """Extract every embedded image from each PDF found under a directory.

    The directory is searched recursively for ``*.pdf`` files. Each image is
    written next to the PDF it came from as
    ``page{page}-{xref}_{pdf stem}.jpg``; including the PDF's stem keeps
    images from different PDFs from overwriting one another.

    Args:
        path_to_files: Directory to search; converted with ``pathlib.Path``.

    Returns:
        None. The extracted images are written to disk.
    """
    root = Path(path_to_files)  # convert the str to a pathlib object

    for file in root.rglob('*.pdf'):  # rglob also descends into subdirectories
        pdf = fitz.open(file)
        try:
            # NOTE(review): newer PyMuPDF releases rename getPageImageList /
            # writeImage to get_page_images / Pixmap.save - confirm against
            # the installed version before switching.
            for current_page in range(len(pdf)):
                for image in pdf.getPageImageList(current_page):
                    xref = image[0]  # first item of the image entry identifies the image
                    pix = fitz.Pixmap(pdf, xref)
                    if pix.n >= 5:
                        # Same threshold as before: these pixmaps are converted
                        # to RGB first, all others are written as they are.
                        pix = fitz.Pixmap(fitz.csRGB, pix)
                    target = file.parent / f'page{current_page}-{xref}_{file.stem}.jpg'
                    pix.writeImage(str(target))
        finally:
            # Close the document even if an image fails, so a handle is not
            # leaked for every PDF processed.
            pdf.close()


# path to files
# A raw string keeps the backslashes literal, but a raw string literal cannot
# end with a single backslash - hence no trailing separator here.
path_to_files = r'C:\Users\mdl518\Desktop'  # do not include the trailing backslash '\'

# call the function
pdf_extract(path_to_files)