I am trying to read PDF files from a directory (path
) to extract individual images from each PDF and write to the same directory. However, I am unable to perform the following function on each file as my script only parses the last file in the directory. The code I am using is shown below:
pip install pymupdf
import os
import PyPDF2
import fitz # from pymupdf
import glob
path = "C:\\Users\\mdl518\\Desktop\\"
def pdf_extract():
for filename in glob.glob(os.path.join(path, "*.pdf"), recursive=True): # file path specifying the location of the PDF files
with open(os.path.join(os.getcwd(), filename),'rb') as f: # open/read the PDF files
pdf_document=fitz.open(filename)
for current_page in range(len(pdf_document)): # iterate over the total number of pages in each PDF
for image in pdf_document.getPageImageList(current_page):
xref=image[0] # initiates the cross-reference number for objects on the first page of the PDF
pix=fitz.Pixmap(pdf_document, xref)
if pix.n < 5: # capture all images and write to the file path
pix.writeImage(os.path.join(path,"page%s-%s.jpg") % (current_page, xref))
else:
pix1 = fitz.Pixmap(fitz.csRGB, pix)
pix1.writeImage(os.path.join(path,"page%s-%s.jpg") % (current_page, xref))
pix1 = None
pix = None
pdf_extract()
I have tried to use glob
, os.listdir()
, and os.walk()
to parse the individual PDFs but the best I have gotten is just pulling the images from the last PDF file to be read/written to the file path. Is there any easier way to go about this, or is it just a small tweak to my "glob" statement? Any assistance is most appreciated!
with open(os.path.join(os.getcwd(), filename),'rb') as f: # open/read the PDF files
is not needed, f
is never usedpix.writeImage(os.path.join(path,"page%s-%s.jpg") % (current_page, xref))
pix1.writeImage(os.path.join(path,"page%s-%s.jpg") % (current_page, xref))
current_page
& xref
are not necessarily unique for each filepathlib
, part of the standard library, because it treats paths as objects with methods, unlike glob
and os
, which treat paths as strings. Also see Python 3's pathlib Module: Taming the File System
_{file.stem}
to the save path, to create a unique file namef-strings
for string formatting. Also see PEP 498 - Literal String Interpolationfrom pathlib import Path
import PyPDF2
import fitz
def pdf_extract(path_to_files: str):
path_to_files = Path(path_to_files) # convert the str to a pathlib object
for file in path_to_files.rglob('*.pdf'): # pathlib has rglob
pdf = fitz.open(file)
for current_page in range(len(pdf)):
for image in pdf.getPageImageList(current_page):
xref = image[0]
pix = fitz.Pixmap(pdf, xref)
if pix.n < 5:
pix.writeImage(str(file.parent / f'page{current_page}-{xref}_{file.stem}.jpg')) # updated filename
else:
pix1 = fitz.Pixmap(fitz.csRGB, pix)
pix1.writeImage(str(file.parent / f'page{current_page}-{xref}_{file.stem}.jpg')) # updated filename
# path to files
path_to_files = r'C:\Users\mdl518\Desktop' # do not include the trailing backslash '\'
# call the function
pdf_extract(path_to_files)