python pymupdf

Extraction of position of an image in a PDF file


I am using the PyMuPDF library to extract images from a PDF file. I want to get the position (origin) and the size of each image.
I can get the sizes. However, I can't get the positions correctly using:

def extract_images_from_pdf(_input_pdf_file_name, _output_folder):
    """Save every image embedded in a PDF as a PNG file.

    Each page of the document is visited in order and every image listed
    for that page is extracted and written to
    ``<_output_folder>/image_NNNN.png``.

    Args:
        _input_pdf_file_name: Path of the PDF file to read.
        _output_folder: Existing directory that receives the PNG files.

    Returns:
        None. The extracted images are written to disk.
    """
    # A single counter for the whole document: a per-page index would make
    # images of later pages overwrite the files written for earlier pages.
    _saved_image_count = 0

    # The context manager closes the document even if an extraction fails.
    with fitz.open(_input_pdf_file_name) as _pdf_file_document:
        for _page in _pdf_file_document:
            # The image loop must run once per page; running it after the
            # page loop would only ever see the last page's image list.
            for _image_item in _page.get_images(full=True):
                _xref = _image_item[0]
                _base_image = _pdf_file_document.extract_image(_xref)
                _pil_image = PILImage.open(BytesIO(_base_image["image"]))

                _saved_image_count += 1
                _output_image_name = f"{_output_folder}/image_{_saved_image_count:04d}.png"
                # Passing the path lets Pillow open and close the file
                # itself; the format is taken from the ".png" extension.
                _pil_image.save(_output_image_name)

I can process each image and extract it.
However, I am having trouble retrieving the original position of those images. I want to render each page as an image, get each image in that page, and then get the origin point and the size of each extracted image. I am using the following code to get the origin, but for some reason I am not getting the origin position correctly.

def get_image_origins(_input_pdf_file_name, _page_index):
    """Return the bounding box of each image displayed on one PDF page.

    Args:
        _input_pdf_file_name: Path of the PDF file to read.
        _page_index: Zero-based index of the page to inspect.

    Returns:
        A list of rectangles (x_min, y_min, x_max, y_max), one per image of
        the page that is actually displayed. Only the first placement of an
        image is reported; images listed for the page but never drawn are
        omitted.

    NOTE(review): per the PyMuPDF documentation these rectangles are in
    page coordinates (points). If they are compared against a page rendered
    with a zoom factor or DPI setting, they presumably need to be scaled by
    the same factor -- confirm against the rendering code.
    """
    _image_bounding_boxes = []

    # The context manager closes the document once the boxes are collected.
    with fitz.open(_input_pdf_file_name) as _pdf_file_document:
        # Load the page once instead of re-indexing the document per image.
        _page = _pdf_file_document[_page_index]

        for _image_item in _page.get_images(full=True):
            # Pass the whole image entry rather than only its reference name
            # (item[7]): a name lookup can fail when the same name occurs
            # more than once on the page, whereas the entry carries the xref.
            # The transform matrix is not requested because it was discarded.
            _image_rects = _page.get_image_rects(_image_item)

            if _image_rects:
                _image_bounding_boxes.append(_image_rects[0])

    return _image_bounding_boxes

Please help.


Solution

  • Here is code that returns the position (x0, y0, x1, y1) of each image in the PDF document:

    import fitz  # PyMuPDF
    import pytesseract
    from PIL import Image
    import io
    import os
    
    def extract_images(pdf_path):
        """Collect every displayed image of a PDF together with its position.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            A list with one dictionary per page, in page order. Each
            dictionary has the key 'images' holding a list of
            ``{'image': <PIL image>, 'coords': (x0, y0, x1, y1)}`` entries.
            'coords' is the rectangle of the first placement of the image
            on that page. Images that are listed for a page but never drawn
            on it have no position and are left out.
        """
        pages_data = []

        # The context manager closes the document when extraction is done;
        # the returned PIL images are built from copied bytes.
        with fitz.open(pdf_path) as pdf_document:
            for page in pdf_document:
                page_data = {'images': []}

                for img in page.get_images(full=True):
                    # First entry of an image item is its xref number.
                    xref = img[0]

                    # An image can be part of the page resources without
                    # being displayed; then there is no rectangle and
                    # indexing [0] would raise IndexError.
                    img_rects = page.get_image_rects(xref)
                    if not img_rects:
                        continue
                    img_rect = img_rects[0]
                    coords = (img_rect.x0, img_rect.y0, img_rect.x1, img_rect.y1)

                    base_image = pdf_document.extract_image(xref)
                    image = Image.open(io.BytesIO(base_image["image"]))

                    page_data['images'].append({'image': image, 'coords': coords})

                pages_data.append(page_data)

        return pages_data
    

    For the code above I am using the following Python and package versions:

    1. Python==3.12.3
    2. PyMuPDF==1.24.5
    3. plivo==4.33.0
    4. pytesseract==0.3.10