How to extract table from PDF with boxes into pandas dataframe

I have code that detects a table in a PDF that appears after a specific section, and parses the information in the table and copies it into a pandas dataframe.

Now, I want to indicate whether a box is checked (or not blank) next to the information parsed from the table.

Here is a link to the PDF

Here is my code so far which can't quite seem to identify whether a box is marked or not.

import pandas as pd
import re
import fitz 
from math import sqrt
from io import BytesIO

# Path of the PDF to parse (placeholder value; point it at the real document).
PDF_FILE_NAME = "path/test_doc.pdf"
# Heading text that precedes the table; extract_table_text matches it
# literally and case-insensitively.
SECTION_HEADER = "Section 3: Table"

# --- Helper Functions (Re-using the reliable text extraction) ---

# Leading checkbox markers: box/check glyphs (☑ ☐ □ ■ ✓) in any position at
# the start, or a stand-alone "X"/"x" token. The look-ahead keeps words that
# merely begin with the letter (e.g. "Xylophone") intact.
_LEADING_MARKS_RE = re.compile(
    r"^(?:[\u2611\u2610\u25A1\u25A0\u2713]\s*|[Xx](?=\s|$)\s*)+"
)


def clean_item_text(text):
    """Return *text* without leading checkbox markers and outer whitespace.

    Only markers at the very start of the text are removed; an ``x`` or ``X``
    inside the item name (as in "Box 6") is preserved.

    Args:
        text: Raw cell text; may be ``None``/NaN or an empty string.

    Returns:
        The cleaned item name, or ``""`` when *text* is missing or empty.
    """
    if pd.isna(text) or text == "":
        return ""
    stripped = str(text).strip()
    return _LEADING_MARKS_RE.sub("", stripped).strip()

def extract_table_text(pdf_path, section_header, column_sizes=(3, 6, 6)):
    """
    Extract the table that follows ``section_header`` into a DataFrame.

    The text of every page is concatenated, the part between the section
    header and the next "Section N:" heading is split into non-empty lines,
    and those lines are interpreted as: two leading lines that are skipped,
    one header line per column, then the item lines of each column in turn.

    Args:
        pdf_path: Path of the PDF, passed to ``fitz.open``.
        section_header: Heading text preceding the table; matched literally
            and case-insensitively.
        column_sizes: Number of item lines belonging to each column, in the
            order the columns appear in the extracted text. The default
            ``(3, 6, 6)`` reproduces the previously hard-coded layout.

    Returns:
        A ``(df, headers)`` tuple: the DataFrame of cleaned item names (short
        columns padded with ``""``) and the list of column headers.

    Raises:
        ValueError: If ``column_sizes`` is empty, the section header is not
            found, or too few lines follow it to hold the headers and at
            least one item.
    """
    if not column_sizes:
        raise ValueError("column_sizes must contain at least one column.")

    with fitz.open(pdf_path) as doc:
        full_text = "".join(page.get_text("text") for page in doc)
    # The extracted text contains "Sec$on" where "Section" is meant
    # (presumably a ligature glyph in the PDF font); normalise it so the
    # header search and the split below can match.
    full_text = full_text.replace("Sec$on", "Section")

    section_match = re.search(re.escape(section_header), full_text, re.IGNORECASE)
    if not section_match:
        raise ValueError(f"Section '{section_header}' not found.")
    text_after_section = full_text[section_match.end():].strip()
    # Keep only the text up to the next "Section N:" heading.
    table_text = re.split(r"Section\s*\d+\s*:", text_after_section, maxsplit=1)[0]

    lines = [l.strip() for l in table_text.split("\n") if l.strip()]

    header_start = 2  # the first two lines after the heading are not table data
    header_end = header_start + len(column_sizes)
    if len(lines) < header_end + 1:
        raise ValueError("Insufficient lines found for table structure.")

    headers = [l.strip('"').strip() for l in lines[header_start:header_end]]
    items_raw = lines[header_end:]

    # Slice the flat item list into consecutive per-column runs and clean
    # each entry down to the bare item name.
    columns = []
    offset = 0
    for size in column_sizes:
        columns.append([clean_item_text(x) for x in items_raw[offset:offset + size]])
        offset += size

    # Pad short columns so all have equal length for the DataFrame.
    maxlen = max(len(c) for c in columns)
    columns = [c + [""] * (maxlen - len(c)) for c in columns]

    df = pd.DataFrame(dict(zip(headers, columns)))

    # Return both the DataFrame and the list of headers
    return df, headers

# --- OCR/Image Analysis Logic ---

def _locate_items(words, wanted):
    """Map each wanted item name to the bbox of its first match on a page.

    ``words`` is the list returned by ``page.get_text("words")``; each entry
    is ``(x0, y0, x1, y1, text, block_no, line_no, word_no)``. Item names can
    span several words, so runs of consecutive words within the same text
    line are joined and compared against the names.
    """
    # Compare on whitespace-normalised names; keep the original as the value.
    lookup = {" ".join(str(item).split()): item for item in wanted}
    if not lookup:
        return {}
    max_words = max(len(key.split()) for key in lookup)

    lines = {}
    for word in words:
        lines.setdefault((word[5], word[6]), []).append(word)

    found = {}
    for line in lines.values():
        line.sort(key=lambda w: w[7])
        for start in range(len(line)):
            # Never start a run on a bare checkbox glyph: the bbox must begin
            # at the item text so the ROI to its left covers the box.
            if clean_item_text(line[start][4]) == "":
                continue
            last_stop = min(len(line), start + max_words)
            for stop in range(start + 1, last_stop + 1):
                run = line[start:stop]
                text = clean_item_text(" ".join(w[4] for w in run))
                item = lookup.get(" ".join(text.split()))
                if item is not None and item not in found:
                    found[item] = (
                        min(w[0] for w in run),
                        min(w[1] for w in run),
                        max(w[2] for w in run),
                        max(w[3] for w in run),
                    )
    return found


def _dark_pixel_ratio(pix, dark_pixel_threshold=0.9):
    """Return the fraction of pixels darker than the threshold, or None if empty."""
    total_pixels = pix.width * pix.height
    if total_pixels == 0:
        return None

    samples = pix.samples
    step = pix.n  # bytes per pixel
    dark_pixel_count = 0
    for i in range(0, len(samples), step):
        if step >= 3:
            r, g, b = samples[i:i + 3]
            # Convert RGB to grayscale (luminance)
            luminance = (0.2126 * r + 0.7152 * g + 0.0722 * b) / 255.0
        else:
            # 1 or 2 bytes per pixel: treat the first byte as gray level.
            luminance = samples[i] / 255.0
        if luminance < dark_pixel_threshold:
            dark_pixel_count += 1
    return dark_pixel_count / total_pixels


def scan_checkbox_roi(pdf_path, df, all_headers):
    """
    Scans an image region (ROI) to the left of each item name to detect a mark.

    For every non-empty item in the ``all_headers`` columns of ``df``, the
    item text is located on each page, the area just left of it is rendered
    at 3x zoom, and the share of dark pixels decides the status.

    Args:
        pdf_path: Path of the PDF, passed to ``fitz.open``.
        df: DataFrame whose cells hold the cleaned item names.
        all_headers: Column labels of ``df`` to take item names from.

    Returns:
        dict mapping item name to ``"checked"``, ``"unchecked"`` or ``""``
        (ROI could not be rendered). Items not found on any page are absent.
        If an item occurs on several pages, the last page wins.
    """
    mapping = {}

    # Unique, non-blank item names across all requested columns.
    wanted = {
        item
        for col in all_headers
        for item in df[col].dropna().tolist()
        if item != ""
    }

    print("="*60)
    print("IMAGE SCAN (OCR) ATTEMPT")
    print("="*60)

    with fitz.open(pdf_path) as doc:
        for page in doc:
            item_coords = _locate_items(page.get_text("words"), wanted)

            for item_text, item_bbox in item_coords.items():
                # ROI: a small rectangle to the left of the item text,
                # 20 pt wide, ending 5 pt before the text and extended
                # 5 pt above and below it.
                roi_rect = fitz.Rect(item_bbox[0] - 25, item_bbox[1] - 5,
                                     item_bbox[0] - 5, item_bbox[3] + 5)

                mark_ratio = None
                if not roi_rect.is_empty:
                    # Render the ROI at 3x zoom and measure how dark it is.
                    pix = page.get_pixmap(matrix=fitz.Matrix(3, 3), clip=roi_rect)
                    mark_ratio = _dark_pixel_ratio(pix)

                if mark_ratio is None:
                    mapping[item_text] = ""
                    print(f"  ✗ '{item_text}' - Invalid ROI")
                    continue

                # More than 5% dark pixels counts as a mark.
                status = "checked" if mark_ratio > 0.05 else "unchecked"
                mapping[item_text] = status
                print(f"  ✓ '{item_text}' (Ratio: {mark_ratio:.3f}) -> {status}")

    return mapping

# --- Main Logic ---

def parse_pdf_for_table_with_checkboxes(pdf_file_path, section_header):
    """Return the section's table with a "<header> Status" column per column.

    The item names come from ``extract_table_text``; their statuses come from
    ``scan_checkbox_roi``. Each status column is placed directly after the
    data column it belongs to. Blank cells and items without a detected
    status get ``""``.
    """
    table, headers = extract_table_text(pdf_file_path, section_header)
    status_by_item = scan_checkbox_roi(pdf_file_path, table, headers)

    def lookup_status(cell):
        # Blank/missing cells carry no status.
        if pd.isna(cell) or cell == "":
            return ""
        return status_by_item.get(str(cell).strip(), "")

    # `headers` is the fixed list of original columns, so adding status
    # columns while looping does not affect the iteration.
    ordered_columns = []
    for header in headers:
        status_header = f"{header} Status"
        table[status_header] = table[header].map(lookup_status)
        ordered_columns.extend((header, status_header))

    return table[ordered_columns]

# Run: build the item/status DataFrame for the configured PDF and section.
result = parse_pdf_for_table_with_checkboxes(PDF_FILE_NAME, SECTION_HEADER)

The final dataframe should look like this:

Col1 Col1_Status Col2 Col2_Status Col3 Col3_Status
Item1 Checked Item4 Checked Item10 Checked
Item2         Item5         Item11
Item3         Item6         Item12
              Item7 Checked Item13 Checked
              Item8         Item14
              Item9         Item15 Checked

But the columns are a little misaligned and none of the Xs in the boxes are being detected.

How do I solve this problem?

Solution

As mentioned in the comments, you can make your life much easier by working with pymupdf.Page.find_tables. This method has many arguments that can be used to identify the table properly or to extract useful information for getting the entries.

In the example PDF given in the OP, the table has no horizontal lines separating the values, which influences the detection of the column entries (in this case they are \n-separated), so some manual tuning is required.

import itertools as it
import fitz  
import numpy as np
import pandas as pd


# Suffix appended to the text of every item whose box is checked.
CHECKED_STATUS_MSG = " [OK]"
# Filler for cells of columns that have fewer entries than the longest one.
DF_FILLVALUE = ''

path = "your path.pdf"

with fitz.open(path) as doc:
    page = doc[0]

    # get table as df
    #################
    for table in page.find_tables().tables:
        # Each kept cell becomes the list of its text lines; empty cells and
        # rows without any content are dropped.
        rows = []
        for raw_row in table.extract():
            cells = [cell.splitlines() for cell in raw_row if cell]
            if cells:
                rows.append(cells)

        # header: the first two rows, paired up column-wise into the tuples
        # of a two-level column index
        header_rows = [*zip(*rows[0]), *zip(*rows[1])]
        header = pd.MultiIndex.from_tuples(list(zip(*header_rows)))

        # df entries: the last row after the header rows holds one list of
        # lines per column; transpose it, padding the shorter columns
        body_cells = rows[2:][-1]
        df = pd.DataFrame(list(it.zip_longest(*body_cells, fillvalue=DF_FILLVALUE)))

    # get check-status
    ##################
    box_lefts = set()   # distinct x0 of the boxes -> one per table column
    box_tops = set()    # distinct y0 of the boxes -> one per table row
    boxes = []          # ((y0, x0), is_checked) for every box found
    # https://pymupdf.readthedocs.io/en/latest/functions.html#Page.get_bboxlog
    for kind, *bbox in page.get_bboxlog():
        if kind != 'stroke-path':
            continue
        box = fitz.IRect(*bbox)

        # Only outlines with these areas are treated as checkboxes; the
        # values were tuned for this document and need adjusting for others.
        if f"{box.get_area():.1f}" not in {"196.0", "210.0"}:
            continue

        # Look only at the middle third of the box so that its border does
        # not count; anything non-uniform in there means a mark was drawn.
        dx, dy = box.width/3, box.height/3
        core = fitz.Rect(box.x0+dx, box.y0+dy, box.x1-dx, box.y1-dy)
        core_pix = page.get_pixmap(colorspace="gray", clip=core, annots=False)

        box_tops.add(box.y0)
        box_lefts.add(box.x0)
        boxes.append(((box.y0, box.x0), core_pix.is_unicolor is False))

    boxes.sort()
    tops = sorted(box_tops)
    lefts = sorted(box_lefts)
    # Replace coordinates by grid positions: key = (column index, row index).
    d = {(lefts.index(x0), tops.index(y0)): checked for (y0, x0), checked in boxes}

    # final df
    ##########
    # Unstacking the row level and transposing gives a rows x columns grid.
    df_checked = pd.Series(d).unstack(level=1).T

    # Grid positions without a box count as unchecked.
    mask = df_checked.fillna(False).astype(bool)

    pd_f = pd.DataFrame(df + np.where(mask, CHECKED_STATUS_MSG, ""))
    pd_f.columns = header

    print(pd_f)

Output

             I           II           III
      Column 1     Column 2      Column 3
0  Item 1 [OK]  Item 4 [OK]  Item 10 [OK]
1       Item 2       Item 5       Item 11
2       Item 3       Item 6       Item 12
3               Item 7 [OK]  Item 13 [OK]
4                    Item 8       Item 14
5                    Item 9  Item 15 [OK]

If you don't want None for the missing entries you can specify a default value with

default_value = ''
it.zip_longest(<iterable>, fillvalue=default_value)