python-3.x, python-tesseract, pymupdf

I am building code to extract text from images embedded in a PDF, using "pytesseract" and "PyMuPDF".


The code below gives me this error: Error processing image: 'dict' object has no attribute 'width'. The code reads the images from a PDF and, if the PDF has an image, it should extract the text from that image and check it against a provided list of keywords (that method is not included here, to save some effort).

import os
import pytesseract
import fitz  # PyMuPDF
from PIL import Image

def extract_text_from_image(image):
    """Run Tesseract OCR on one image and return the recognised text.

    Args:
        image: Either the dict returned by ``doc.extract_image(xref)`` (the
            encoded image bytes are read from its ``"image"`` key), or a
            pixmap-like object exposing ``width``, ``height`` and ``samples``.

    Returns:
        The string produced by ``pytesseract.image_to_string``, or ``""``
        when the image cannot be decoded or OCR fails. Errors are printed,
        never raised, so one bad image does not stop the caller.
    """
    try:
        if isinstance(image, dict):
            # extract_image() hands back a dict, not a pixmap: the picture is
            # an encoded file (PNG, JPEG, ...) stored under "image", so let
            # Pillow decode it instead of treating it as raw samples.
            import io  # local import: only this branch needs it

            encoded = image["image"]
            picture = Image.open(io.BytesIO(encoded))
        else:
            # Raw-sample path for pixmap-like objects.
            # NOTE(review): assumes 3-byte RGB samples without alpha, as the
            # original call did -- confirm for CMYK/alpha pixmaps.
            size = (image.width, image.height)
            picture = Image.frombytes("RGB", size, image.samples)

        picture = picture.convert("L")  # Convert to grayscale
        return pytesseract.image_to_string(picture, config='--oem 3 --psm 6')
    except Exception as e:
        # Best-effort boundary: log the error and continue processing
        print(f"Error processing image: {str(e)}")
        return ""

def process_pdf(pdf_path):
    """OCR every image embedded in a PDF, printing the text and progress.

    For each page the embedded images are listed with ``page.get_images``,
    extracted with ``doc.extract_image`` and passed to
    ``extract_text_from_image``. A 50-character progress bar is printed after
    every page, and the file size is reported if it is above 10 MB.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        None. All results are printed. Any exception is caught, printed and
        swallowed so that a caller looping over files can continue.
    """
    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            total_pages = doc.page_count
            for page_number, page in enumerate(doc, start=1):
                image_list = page.get_images(full=True)
                if not image_list:
                    print(f"Page {page_number}/{total_pages}: No images found")
                for img in image_list:
                    xref = img[0]  # first item of each entry is the image xref
                    base_image = doc.extract_image(xref)
                    extracted_text = extract_text_from_image(base_image)
                    # Process the extracted text (e.g., check for keywords)
                    # Log relevant information
                    print(f"Page {page_number}/{total_pages}: Extracted text: {extracted_text}")
                # Show progress to the user (bar is 50 chars wide: 2% per '#')
                progress_percent = page_number / total_pages * 100
                print(f"Processing progress: [{'#' * int(progress_percent / 2):50s}] {progress_percent:.2f}%")
        # Check file size
        # NOTE(review): this runs after the file has already been processed,
        # so it only reports the size; nothing is actually skipped.
        file_size_mb = os.path.getsize(pdf_path) / (1024 * 1024)
        if file_size_mb > 10:  # Adjust the threshold as needed
            print(f"File size ({file_size_mb:.2f} MB) exceeds threshold. Moving to the next file.")
    except Exception as e:
        # Log the error and continue processing other files
        print(f"Error processing PDF {pdf_path}: {str(e)}")

# Example invocation; the file name looks like a placeholder to be replaced
# with a real PDF path.
process_pdf(r"your_pdf.pdf")

Solution

  • import fitz  # PyMuPDF - nothing else is needed
    # Open the input inside a context manager so it is closed when done.
    with fitz.open("input.pdf") as doc:
        for page in doc:
            for item in page.get_images():
                xref = item[0]
                pix = fitz.Pixmap(doc, xref)  # make Pixmap from image
                # OCR the image, make a 1-page PDF from it
                pdfdata = pix.pdfocr_tobytes()  # 1-page PDF in memory
                # open as PDF document; released again once the text is read
                with fitz.open("pdf", pdfdata) as ocrpdf:
                    ocrtext = ocrpdf[0].get_text()  # extract OCR-ed text from page 1
                # ... do something with the text
                # note: text details / metadata / positions etc. are available too
    

    In a similar way, you can OCR any image, e.g. present in a file, using its filename:

    1. Make a pixmap from the image file: pix = fitz.Pixmap("image.file").
    2. Make a 1-page PDF with embedded OCRed text in memory: pdfdata = pix.pdfocr_tobytes() # returns a bytes object
    3. Make a PDF document from these bytes: doc = fitz.open("pdf", pdfdata).
    4. Load first (and only) page of that document: page = doc[0].
    5. Extract text as usual: text = page.get_text().

    Note: I am a maintainer and the original creator of PyMuPDF.