The code below gives me this error: "Error processing image: 'dict' object has no attribute 'width'". The code reads images from a PDF; if the PDF contains an image, it should extract the text from that image and check it against a provided list of keywords (the keyword-checking method is omitted here for brevity).
import os
import pytesseract
import fitz # PyMuPDF
from PIL import Image
def extract_text_from_image(image):
    """Run Tesseract OCR on one image and return the recognized text.

    Args:
        image: Either the dict returned by ``fitz.Document.extract_image``
            (its ``"image"`` entry holds the encoded image bytes), or a
            Pixmap-like object exposing ``width``, ``height`` and RGB
            ``samples``.

    Returns:
        The text recognized by Tesseract, or ``""`` if the image could not
        be decoded or OCR failed (the error is printed, not raised).
    """
    import io

    try:
        if isinstance(image, dict):
            # extract_image() yields the image in its stored file format
            # (PNG, JPEG, ...), so let Pillow decode the bytes instead of
            # treating them as raw pixel samples.
            pil_image = Image.open(io.BytesIO(image["image"]))
        else:
            # Pixmap-like input: raw RGB samples with explicit dimensions.
            pil_image = Image.frombytes(
                "RGB", (image.width, image.height), image.samples
            )
        pil_image = pil_image.convert("L")  # Convert to grayscale
        extracted_text = pytesseract.image_to_string(
            pil_image, config='--oem 3 --psm 6'
        )
        return extracted_text
    except Exception as e:
        # Best effort: log the error and let the caller continue with
        # the remaining images.
        print(f"Error processing image: {str(e)}")
        return ""
def process_pdf(pdf_path):
    """OCR every embedded image of a PDF and print the text page by page.

    For each page, the images referenced by the page are extracted and
    passed to ``extract_text_from_image``; the recognized text and a
    progress bar are printed. After all pages are handled, the file size
    is reported if it exceeds 10 MB.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        None. Any error is printed rather than raised, so a caller looping
        over several files can continue with the next one.
    """
    try:
        # The context manager closes the document even when a page fails.
        with fitz.open(pdf_path) as doc:
            total_pages = doc.page_count
            for page_number, page in enumerate(doc, start=1):
                image_list = page.get_images(full=True)
                if not image_list:
                    print(f"Page {page_number}/{total_pages}: No images found")
                for img in image_list:
                    xref = img[0]
                    base_image = doc.extract_image(xref)
                    extracted_text = extract_text_from_image(base_image)
                    # Process the extracted text (e.g., check for keywords)
                    # Log relevant information
                    print(f"Page {page_number}/{total_pages}: Extracted text: {extracted_text}")
                # Show progress to the user
                progress_percent = page_number / total_pages * 100
                print(f"Processing progress: [{'#' * int(progress_percent / 2):50s}] {progress_percent:.2f}%")
        # Check file size. NOTE: this only reports the size; it does not
        # skip or abort anything, since the pages were already processed.
        file_size_mb = os.path.getsize(pdf_path) / (1024 * 1024)
        if file_size_mb > 10:  # Adjust the threshold as needed
            print(f"File size ({file_size_mb:.2f} MB) exceeds threshold. Moving to the next file.")
    except Exception as e:
        # Log the error and continue processing other files
        print(f"Error processing PDF {pdf_path}: {str(e)}")
# Example invocation; "your_pdf.pdf" is a placeholder for a real PDF path.
process_pdf(r"your_pdf.pdf")
import fitz # PyMuPDF - nothing else is needed
# OCR every image referenced by the pages of "input.pdf" using PyMuPDF only.
doc = fitz.open("input.pdf")
for page in doc:
    for item in page.get_images():
        # Build a Pixmap from the image's xref (first entry of the item).
        pix = fitz.Pixmap(doc, item[0])
        # OCR the Pixmap into an in-memory 1-page PDF and open that PDF.
        ocrpdf = fitz.open("pdf", pix.pdfocr_tobytes())
        # The OCR-ed text is read from the first (and only) page.
        ocrpage = ocrpdf[0]
        ocrtext = ocrpage.get_text()
        # ... do something with the text
        # note: text details / metadata / positions etc. are available too
In a similar way, you can OCR any image, e.g. present in a file, using its filename:
# OCR a standalone image file: load it as a Pixmap, OCR it into a PDF held
# in memory, then read the text back from that PDF's first page.
pix = fitz.Pixmap("image.file")
pdfdata = pix.pdfocr_tobytes()  # returns a bytes object
doc = fitz.open("pdf", pdfdata)
page = doc[0]
text = page.get_text()
Note: I am a maintainer and the original creator of PyMuPDF.