I have code that detects a table in a PDF that appears after a specific section, and parses the information in the table and copies it into a pandas dataframe.
Now, I want to record, next to each item parsed from the table, whether its checkbox is checked (i.e. not blank).
Here is a link to the PDF
Here is my code so far which can't quite seem to identify whether a box is marked or not.
import pandas as pd
import re
import fitz
from math import sqrt
from io import BytesIO
# Path of the PDF to parse (placeholder value; point it at the real document).
PDF_FILE_NAME = "path/test_doc.pdf"
# Heading text of the section whose table is extracted.
SECTION_HEADER = "Section 3: Table"
# --- Helper Functions (Re-using the reliable text extraction) ---
# Leading checkbox markers: the glyphs ☑ ☐ □ ■ ✓, or a bare "X"/"x" used as a
# mark. The letter form only counts when it is a standalone token (followed by
# whitespace or the end of the text), so names such as "Box 1" or "Export"
# keep their letters.
_LEADING_MARK_RE = re.compile(
    r"^(?:\s*(?:[\u2611\u2610\u25A1\u25A0\u2713]|[Xx](?=\s|$)))+\s*"
)


def clean_item_text(text):
    """Remove leading checkbox symbols and surrounding whitespace.

    Args:
        text: Raw cell/word text; may be ``None``/NaN or an empty string.

    Returns:
        The text without any leading checkbox markers, stripped of
        surrounding whitespace. Missing or empty input yields "".
    """
    if pd.isna(text) or text == "":
        return ""
    # Only markers at the very start are dropped; the rest of the text is
    # left untouched so item names containing "x"/"X" survive intact.
    return _LEADING_MARK_RE.sub("", str(text).strip()).strip()
def extract_table_text(pdf_path, section_header):
    """
    Build a DataFrame of cleaned item names from the table after a section header.

    The text of every page is concatenated, the part between ``section_header``
    (matched case-insensitively) and the next "Section <n>:" heading is kept,
    and its non-empty lines are sliced with a fixed layout: lines 2-4 are the
    three column headers, the following lines are the items of column 1
    (3 items), column 2 (6 items) and column 3 (6 items). Every item goes
    through ``clean_item_text``; shorter columns are padded with "".

    Returns:
        ``(df, headers)`` - the DataFrame and the list of its column headers.

    Raises:
        ValueError: if the section header is not found, or fewer than six
            non-empty lines follow it.
    """
    with fitz.open(pdf_path) as pdf:
        document_text = "".join(pg.get_text("text") for pg in pdf)

    # The extracted text spells the heading "Sec$on" (presumably a ligature
    # artefact of this PDF - confirm); normalise it before searching.
    document_text = document_text.replace("Sec$on", "Section")

    header_hit = re.search(re.escape(section_header), document_text, re.IGNORECASE)
    if header_hit is None:
        raise ValueError(f"Section '{section_header}' not found.")

    # Keep only what lies before the next "Section <n>:" heading.
    remainder = document_text[header_hit.end():].strip()
    table_part, *_ = re.split(r"Section\s*\d+\s*:", remainder, maxsplit=1)

    rows = []
    for raw_line in table_part.split("\n"):
        stripped = raw_line.strip()
        if stripped:
            rows.append(stripped)
    if len(rows) < 6:
        raise ValueError("Insufficient lines found for table structure.")

    headers = [name.strip('"').strip() for name in rows[2:5]]
    body = rows[5:]

    # Fixed item ranges per column, based on the provided data structure.
    bounds = ((0, 3), (3, 9), (9, 15))
    columns = [
        [clean_item_text(cell) for cell in body[lo:hi]]
        for lo, hi in bounds
    ]

    # Pad every column to the length of the longest one.
    longest = max(map(len, columns))
    for column in columns:
        column.extend([""] * (longest - len(column)))

    df = pd.DataFrame(dict(zip(headers, columns)))
    # Return both the DataFrame and the list of headers
    return df, headers
# --- OCR/Image Analysis Logic ---
def scan_checkbox_roi(pdf_path, df, all_headers):
    """
    Scan the image region to the left of each item name to detect a mark.

    On every page, the first word whose cleaned text equals an item name from
    ``df[all_headers]`` is located; a small rectangle to its left is rendered
    at 3x zoom and the share of dark pixels decides the status.

    Args:
        pdf_path: Path of the PDF opened with PyMuPDF.
        df: DataFrame whose ``all_headers`` columns hold the item names.
        all_headers: Names of the columns of ``df`` to collect items from.

    Returns:
        dict mapping item name -> "checked", "unchecked", or "" when no
        usable region could be rendered. Items that are not found on any page
        are absent from the mapping; a later page overrides an earlier one.

    NOTE(review): ``page.get_text("words")`` yields one entry per word, so an
    item name containing spaces can never match here - confirm against the
    real item names.
    """
    mapping = {}
    # Unique, non-blank item names; a set keeps the per-word lookup O(1).
    all_items = {
        item
        for col in all_headers
        for item in df[col].dropna().tolist()
        if item != ""
    }
    print("="*60)
    print("IMAGE SCAN (OCR) ATTEMPT")
    print("="*60)

    # Loop-invariant settings.
    zoom = fitz.Matrix(3, 3)          # render the ROI at high resolution
    dark_pixel_threshold = 0.9        # luminance below 90% white counts as dark
    checked_ratio = 0.05              # more than 5% dark pixels -> mark detected

    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Map each item name to the bbox (x0, y0, x1, y1) of its first
            # occurrence on this page.
            item_coords = {}
            for word in page.get_text("words"):
                text = clean_item_text(word[4])
                if text in all_items and text not in item_coords:
                    item_coords[text] = word[:4]

            for item_text, (x0, y0, _x1, y1) in item_coords.items():
                # ROI: a small rectangle to the left of the item text.
                roi_rect = fitz.Rect(x0 - 25, y0 - 5, x0 - 5, y1 + 5)

                pix = None
                if not roi_rect.is_empty:
                    pix = page.get_pixmap(matrix=zoom, clip=roi_rect)
                total_pixels = pix.width * pix.height if pix is not None else 0

                # An empty ROI, or one that renders to zero pixels, cannot be
                # analysed (and would otherwise divide by zero below).
                if total_pixels == 0:
                    mapping[item_text] = ""
                    print(f" ✗ '{item_text}' - Invalid ROI")
                    continue

                # Read the pixel buffer once: ``pix.samples`` builds a new
                # bytes object on every access.
                samples = pix.samples
                dark_pixel_count = 0
                for i in range(0, len(samples), pix.n):
                    r, g, b = samples[i:i + 3]
                    # Convert RGB to grayscale (luminance)
                    luminance = (0.2126 * r + 0.7152 * g + 0.0722 * b) / 255.0
                    if luminance < dark_pixel_threshold:
                        dark_pixel_count += 1

                mark_ratio = dark_pixel_count / total_pixels
                status = "checked" if mark_ratio > checked_ratio else "unchecked"
                mapping[item_text] = status
                print(f" ✓ '{item_text}' (Ratio: {mark_ratio:.3f}) -> {status}")
    return mapping
# --- Main Logic ---
def parse_pdf_for_table_with_checkboxes(pdf_file_path, section_header):
    """
    Return the section's table with a "<column> Status" column after each data column.

    The item names come from ``extract_table_text``; the status of each item
    is looked up in the mapping produced by ``scan_checkbox_roi``. Blank or
    missing items, and items absent from the mapping, get the status "".
    """
    table, data_columns = extract_table_text(pdf_file_path, section_header)
    status_by_item = scan_checkbox_roi(pdf_file_path, table, data_columns)

    def lookup_status(cell):
        # Blank / missing cells have no checkbox to report on.
        if pd.isna(cell) or cell == "":
            return ""
        return status_by_item.get(str(cell).strip(), "")

    # Iterate over the original columns only, interleaving each with its
    # newly added status column.
    ordered = []
    for column in data_columns:
        status_name = f"{column} Status"
        table[status_name] = table[column].map(lookup_status)
        ordered.extend((column, status_name))
    return table[ordered]
# Run the whole pipeline on the configured PDF and section; `result` is the
# DataFrame returned by parse_pdf_for_table_with_checkboxes.
result = parse_pdf_for_table_with_checkboxes(PDF_FILE_NAME, SECTION_HEADER)
The final dataframe should look like this:
Col1   Col1_Status  Col2   Col2_Status  Col3    Col3_Status
Item1  Checked      Item4  Checked      Item10  Checked
Item2               Item5               Item11
Item3               Item6               Item12
                    Item7  Checked      Item13  Checked
                    Item8               Item14
                    Item9               Item15  Checked
But the columns are a little misaligned and none of the Xs in the boxes are being detected.
How do I solve this problem?
As mentioned in the comments, you can make your life much easier by working with pymupdf.Page.find_tables. This method has many arguments that can be used to identify the table properly, or to extract useful information for getting the entries.
In the example PDF given in the question, the table has no horizontal lines separating the values, and this influences the detection of the column entries (in this case the entries of a column arrive as one \n-separated cell), so some manual tuning is required.
import itertools as it
import fitz
import numpy as np
import pandas as pd
# Suffix appended to an item's text when its checkbox is detected as checked.
CHECKED_STATUS_MSG = " [OK]"
# Filler used for columns that have fewer items than the longest column.
DF_FILLVALUE = ''
# Path of the PDF to process (placeholder value).
path = "your path.pdf"
with fitz.open(path) as doc:
    # Only the first page of the document is processed.
    page = doc[0]

    # get table as df
    #################
    tf = page.find_tables()
    for tb in tf.tables:
        # t: one entry per non-empty table row; each entry is a list of the
        # row's non-empty cells, every cell split into its text lines.
        t = []
        for row in tb.extract():
            r = [entry.splitlines() for entry in row if entry]
            if r:
                t.append(r)
    # NOTE(review): the indentation of the original paste was lost; the code
    # below is assumed to run after the loop, i.e. on the rows of the last
    # table found - confirm. With no table on the page `t` is undefined.

    # header
    # t[0] and t[1] are used as two header rows; transposing them pairs the
    # entries up into one (row-0 label, row-1 label) tuple per column.
    # Assumes every header cell holds a single line - TODO confirm.
    header = []
    header.extend(zip(*t[0]))
    header.extend(zip(*t[1]))
    header = list(zip(*header))
    header = pd.MultiIndex.from_tuples(header)

    # df entries
    # t[2:].pop() is the last remaining row: a list of per-column item lists.
    # zip_longest transposes it into table rows, padding short columns with
    # DF_FILLVALUE.
    data_cols = list(it.zip_longest(*t[2:].pop(), fillvalue=DF_FILLVALUE))
    df = pd.DataFrame(data_cols)

    # get check-status
    ##################
    # NOTE: the names are swapped relative to the geometry - `n_cols` collects
    # the y0 values (one per table row) and `n_rows` the x0 values (one per
    # table column). The (x, y) unpacking further down swaps them again, so
    # the final keys are (column index, row index).
    n_cols = set()
    n_rows = set()
    rs = []
    # The loop variable `t` shadows the table rows above; they are no longer
    # needed at this point.
    for t, *r in page.get_bboxlog(): # https://pymupdf.readthedocs.io/en/latest/functions.html#Page.get_bboxlog
        if t == 'stroke-path':
            r = fitz.IRect(*r)
            a = f"{r.get_area():.1f}"
            # you need to do some pre-processing to get the "right" values
            # (the two areas below select the checkbox outlines of this
            # particular PDF; other documents need other values)
            if a in {"196.0", "210.0"}:
                # Render only the central third of the box so the outline
                # itself is not part of the image.
                w, h = r.width/3, r.height/3
                sub_r = fitz.Rect(r.x0+w, r.y0+h, r.x1-w, r.y1-h)
                pix = page.get_pixmap(colorspace="gray", clip=sub_r, annots=False)
                # More than one color inside the box is taken as a mark.
                is_r_checked = pix.is_unicolor is False
                n_cols.add(r.y0)
                n_rows.add(r.x0)
                rs.append(((r.y0, r.x0), is_r_checked))
    rs.sort()
    n_cols = sorted(n_cols)
    n_rows = sorted(n_rows)
    # Replace the coordinates by their rank: key = (column index, row index).
    rs = [((n_rows.index(y), n_cols.index(x)), v) for (x, y), v in rs]
    d = dict(rs)

# final df
##########
# unstack + transpose gives a (row index x column index) frame of check flags;
# positions without a detected box become False.
df_checked = pd.Series(d).unstack(level=1).T
mask = df_checked.fillna(False).astype(bool)
# Append the status suffix to the checked items.
# Assumes `mask` has the same shape as `df` - TODO confirm.
p = df + np.where(mask, CHECKED_STATUS_MSG, "")
pd_f = pd.DataFrame(p)
pd_f.columns=header
print(pd_f)
Output
             I           II           III
      Column 1     Column 2      Column 3
0  Item 1 [OK]  Item 4 [OK]  Item 10 [OK]
1       Item 2       Item 5       Item 11
2       Item 3       Item 6       Item 12
3               Item 7 [OK]  Item 13 [OK]
4                    Item 8       Item 14
5                    Item 9  Item 15 [OK]
If you don't want None for the missing entries, you can specify a default value through the fillvalue argument of zip_longest (the script above already does this via DF_FILLVALUE):
default_value = ''
it.zip_longest(<iterable>, fillvalue=default_value)