python pdf photo-gallery

How can I cut up a PDF and create photo galleries in Python?


I have a PDF containing images like the one shown below. I would like to create a new PDF made of galleries rather than the entire numbered photo, i.e. one that includes only the images that have the same dimensions.

[enter image description here]


Solution

  • First, install the required packages into your Python environment:

    pip install PyMuPDF Pillow

    pip install fpdf

    Next, extract the images from the PDF with the code below:

    import io

    import fitz  # PyMuPDF
    from PIL import Image

    # Path of the PDF to extract images from.
    file = "1710.05006.pdf"

    # Open the document in a ``with`` block so it is closed when we are done.
    with fitz.open(file) as pdf_file:
        # Walk the pages; page_index is 0-based.
        for page_index, page in enumerate(pdf_file):
            # get_images() is the current name of the deprecated getImageList().
            # Query it once and reuse the result for both the report and the loop.
            image_list = page.get_images()
            if image_list:
                print(f"[+] Found a total of {len(image_list)} images in page {page_index}")
            else:
                print("[!] No images found on page", page_index)
            for image_index, img in enumerate(image_list, start=1):
                # The first entry of each image record is its XREF.
                xref = img[0]
                # extract_image() is the current name of the deprecated extractImage().
                base_image = pdf_file.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                # Load the bytes with PIL and save them to the current directory.
                # Passing a file name (not an open file object) lets PIL open
                # and close the output file itself, so no handle is leaked.
                with Image.open(io.BytesIO(image_bytes)) as image:
                    image.save(f"image{page_index+1}_{image_index}.{image_ext}")
    

    Next, crop the extracted images:

    from PIL import Image
    import os.path
    import sys

    # Folder holding the images to crop.
    # NOTE(review): presumably this should point at the folder where the
    # extracted images were saved -- adjust to your setup.
    path = "C:\\Users\\xie\\Desktop\\tiff\\Bmp"
    dirs = os.listdir(path)


    def crop():
        """Crop every regular file listed in ``dirs`` and save it as a BMP.

        Each image is cropped to the box (left=30, upper=10, right=1024,
        lower=1004) and written next to the original as
        ``<original name without extension>Cropped.bmp``.
        """
        for item in dirs:
            fullpath = os.path.join(path, item)
            # Sub-directories and other non-files cannot be opened as images.
            if not os.path.isfile(fullpath):
                continue
            stem = os.path.splitext(fullpath)[0]
            # Skip outputs of an earlier run so they are not cropped again.
            if stem.endswith('Cropped'):
                continue
            # The context manager closes the source file after cropping.
            with Image.open(fullpath) as im:
                imCrop = im.crop((30, 10, 1024, 1004))
                imCrop.save(stem + 'Cropped.bmp', "BMP")


    crop()
    

    Finally, compile the cropped images back into a PDF:

    from fpdf import FPDF

    # NOTE(review): `image_list` is not defined in this snippet. It presumably
    # has to be a list of image file paths (strings), e.g. the cropped files
    # from the previous step -- not the per-page list produced by PyMuPDF in
    # the extraction step. TODO confirm and build the list accordingly.
    pdf = FPDF()

    # One (x, y, w, h) slot per image on a gallery page.
    # Example values: a 2 x 2 grid of 90 x 90 squares with 10-unit margins,
    # assuming FPDF's default page (A4 portrait, millimetres) -- adjust to taste.
    dims = [
        (10, 10, 90, 90),
        (110, 10, 90, 90),
        (10, 110, 90, 90),
        (110, 110, 90, 90),
    ]

    # The first character of each entry is used as its "image type".
    # Sorting the types gives a reproducible page order (a bare set does not).
    for image_type in sorted({i[0] for i in image_list}):
        # Match on the same first character the type was taken from, so an
        # entry lands in exactly one group.
        imgs = [x for x in image_list if x[0] == image_type]
        # Start a new page for every len(dims) images instead of silently
        # dropping everything that does not fit on the first page.
        for start in range(0, len(imgs), len(dims)):
            pdf.add_page()
            for i, j in zip(imgs[start:start + len(dims)], dims):
                pdf.image(i, j[0], j[1], j[2], j[3])
    pdf.output("yourfile.pdf", "F")