I am trying to parse an unstructured PDF file and extract information about some input fields like radiobuttons, but I am not sure how to do that.
I tried using get_fields
from PyPDF2, but it does not return anything because of the nature of the PDF.
When I use extract_text()
it just gives "YES" or "NO" for the radiobutton components. Also, when I print the full extracted text, there are often random spaces in between the words. Is there any method of accurately parsing unstructured PDF files?
I tried the approach suggested by @jaco-kemp, but it did not work. However, here is one solution that works well for the PDF which you shared with me. I am not sure whether it is going to work with other files, as it is a little dirty :)) :
import fitz
import cv2
import numpy as np
def _crop_region(img, top, bottom, left, right):
    """Return img[top:bottom, left:right] with the start bounds clamped at 0."""
    # A negative start index would make NumPy count from the far edge of the
    # image and silently return the wrong pixels, so clamp it to zero.
    top, left = max(int(top), 0), max(int(left), 0)
    return img[top:int(bottom), left:int(right)]


def extract_radio_button_info(pdf_path, label="SOUR SERVICE (PER NACE MR-01-75)",
                              *, save_regions=True):
    """Guess which of the YES/NO radio buttons next to a label is selected.

    For every page whose extracted text contains ``label``, the page is
    rendered to an image, two small regions to the right of each label hit
    are cropped (one for "YES", one for "NO"), and the region with the lower
    mean intensity in its first channel is reported as the selected option.

    The crop offsets are tuned to one particular form layout and will most
    likely need adjusting for other documents.

    Args:
        pdf_path: Path of the PDF file to inspect.
        label: Text that sits to the left of the radio buttons.
        save_regions: When true, the two cropped regions of the most recently
            processed hit are written to ``yes_button_region.png`` and
            ``no_button_region.png`` in the current directory for inspection.

    Returns:
        A list of ``(page_num, selected_option)`` tuples, one per label hit
        that could be evaluated, where ``selected_option`` is ``"YES"`` or
        ``"NO"``.
    """
    results = []
    # The context manager closes the document even if processing fails.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            print('-' * 100)
            print('Page number:', page_num)

            lines = page.get_text("text").splitlines()
            matching_lines = [(i, line) for i, line in enumerate(lines)
                              if label in line]
            if not matching_lines:
                continue

            # Search once per page; doing it once per matching line would
            # process every hit again for each line that contains the label.
            text_instances = page.search_for(label)
            if not text_instances:
                continue
            for i, line in matching_lines:
                print(i, line)

            # Render the page once and reuse the image for every hit.
            # NOTE(review): the rectangle coordinates are used directly as
            # pixel indices, which assumes the default rendering maps one PDF
            # point to one pixel - confirm against the PyMuPDF documentation.
            pix = page.get_pixmap(alpha=False)
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                pix.height, pix.width, pix.n)

            for text_rect in text_instances:
                print('Y coordinate:', text_rect.y1)

                # Find radio button coordinates
                radio_button_x = text_rect.x1 + 50  # Adjust the offset as needed
                radio_button_y = text_rect.y1

                # Extract the radio button regions from the image
                yes_button_region = _crop_region(
                    img, radio_button_y - 7, radio_button_y + 1,
                    radio_button_x + 20, radio_button_x + 30)
                no_button_region = _crop_region(
                    img, radio_button_y - 7, radio_button_y + 1,
                    radio_button_x + 75, radio_button_x + 85)

                # A crop that falls outside the rendered page is empty; it
                # cannot be written or averaged, so skip this hit.
                if yes_button_region.size == 0 or no_button_region.size == 0:
                    print('Radio button region lies outside the page image; skipping')
                    continue

                # Save the radio button regions for visualization (optional)
                if save_regions:
                    cv2.imwrite('yes_button_region.png', yes_button_region)
                    cv2.imwrite('no_button_region.png', no_button_region)

                # Calculate the mean pixel intensity for each region
                yes_mean_intensity = cv2.mean(yes_button_region)[0]
                no_mean_intensity = cv2.mean(no_button_region)[0]

                # The darker region is taken to be the filled-in button; a tie
                # falls through to "NO".
                if yes_mean_intensity < no_mean_intensity:
                    selected_option = "YES"
                else:
                    selected_option = "NO"
                print(f"Selected option: {selected_option}")
                results.append((page_num, selected_option))
    return results
if __name__ == "__main__":
    # Script entry point: run the extractor on the sample document.
    pdf_path = "data/sample.pdf"
    extract_radio_button_info(pdf_path=pdf_path)