I have a PDF containing images like the one shown below. I would like to create a PDF with galleries instead of the entire numbered photo, i.e. including only the images that have the same dimensions. [image: enter image description here]
First, install these packages into your Python environment:
pip install PyMuPDF Pillow
pip install fpdf
After this, extract the images with the code below:
import fitz # PyMuPDF
import io
from PIL import Image
# Extract every embedded image of a PDF and save each one to the current
# working directory as image<page>_<index>.<ext> (page numbers in the file
# names are 1-based).

# file path you want to extract images from
file = "1710.05006.pdf"
# open the file; the context manager closes the document when extraction ends
with fitz.open(file) as pdf_file:
    # iterate over PDF pages
    for page_index, page in enumerate(pdf_file):
        # list the images on this page once and reuse the result below
        image_list = page.get_images()
        # printing number of images found in this page
        if image_list:
            print(f"[+] Found a total of {len(image_list)} images in page {page_index}")
        else:
            print("[!] No images found on page", page_index)
        for image_index, img in enumerate(image_list, start=1):
            # get the XREF of the image
            xref = img[0]
            # extract the image bytes
            base_image = pdf_file.extract_image(xref)
            image_bytes = base_image["image"]
            # get the image extension
            image_ext = base_image["ext"]
            # load it to PIL
            image = Image.open(io.BytesIO(image_bytes))
            # save it to local disk; passing the file name (rather than an
            # open file object) lets PIL open and close the file itself
            image.save(f"image{page_index+1}_{image_index}.{image_ext}")
Next, crop the extracted images:
from PIL import Image
import os.path, sys
# directory that holds the images to crop
path = "C:\\Users\\xie\\Desktop\\tiff\\Bmp"
# snapshot of the directory contents, taken before any cropped copy is written
dirs = os.listdir(path)


def crop():
    """Crop every regular file listed in ``dirs`` and save a BMP copy.

    Each file is opened with PIL, cropped to the box (30, 10, 1024, 1004)
    -- given as (left, upper, right, lower) pixel coordinates -- and written
    next to the source as ``<name>Cropped.bmp``.  Files that already carry
    that suffix are skipped so a second run does not crop earlier output.
    """
    cropped_suffix = 'Cropped.bmp'
    for item in dirs:
        # output of a previous run: do not crop it again
        if item.endswith(cropped_suffix):
            continue
        fullpath = os.path.join(path, item)
        # sub-directories and other non-files are ignored
        if not os.path.isfile(fullpath):
            continue
        base, _ = os.path.splitext(fullpath)
        # the context manager closes the source image once the copy is saved
        with Image.open(fullpath) as im:
            imCrop = im.crop((30, 10, 1024, 1004))
            imCrop.save(base + cropped_suffix, "BMP", quality=100)


# actually run the cropping step; defining crop() alone does nothing
crop()
Finally, compile the cropped images back into a PDF:
from fpdf import FPDF
# Assemble the images into a PDF: one page per image group, each image placed
# in one of the fixed slots listed in ``dims``.
pdf = FPDF()
# (x, y, w, h) of each slot on a page.  These are example values for a 2x2
# grid -- adjust them to the layout you want.
# NOTE(review): assumes FPDF's default unit (mm) and page size -- confirm.
dims = [
    (10, 10, 90, 90),
    (110, 10, 90, 90),
    (10, 110, 90, 90),
    (110, 110, 90, 90),
]
# NOTE(review): this loop expects ``image_list`` to be a list of image file
# names (strings) whose first character identifies the group.  The extraction
# step above leaves ``image_list`` holding PyMuPDF image tuples, so rebuild it
# from the saved/cropped file names before running this part.
for image_type in set([i[0] for i in image_list]):
    # all images belonging to the current group
    imgs = list(filter(lambda x: image_type in x, image_list))
    # each group gets its own page
    pdf.add_page()
    # zip() stops at the shorter sequence, so at most len(dims) images are
    # placed per page
    for i, j in zip(imgs, dims):
        pdf.image(i, j[0], j[1], j[2], j[3])
pdf.output("yourfile.pdf", "F")