Extract White Input Boxes from pdf

Some of the Adobe XFA form fields are missing when the /PageItemUIDToLocationDataMap is extracted from some PDF files. This is shown in the image below, where only the fields identified with black dots on pages 1 and 3 are shown (click the image to open the PDF). How can the missing XFA form fields be extracted without using commercial software?

The following code extracts the data maps from input.pdf, saves them to a CSV file and marks the points on output.pdf. Disable sort_and_filter to see the original data in the .csv file:

import pikepdf
import fitz  # PyMuPDF
import csv

# Source PDF whose per-page InDesign location data map is read.
INPUT_PDF = "input.pdf"
# CSV file that receives the processed (and filtered) rows.
OUTPUT_CSV = "points.csv"
# The input PDF re-saved with a dot drawn for every reported point.
OUTPUT_PDF = "output.pdf"
# Key looked up inside each page's /PieceInfo -> /InDesign dictionary.
TARGET_KEY = "/PageItemUIDToLocationDataMap"

def extract_datamap_points(pdf_path, target_key=TARGET_KEY):
    """Collect the entries stored under *target_key* on every page of a PDF.

    Each page is checked for a ``/PieceInfo`` dictionary containing an
    ``/InDesign`` entry that in turn contains *target_key*; pages without
    it are skipped.

    Returns a list of rows ``[page, id, type, a, b, c, d]`` where ``page``
    is 1-based, ``id`` is the entry key with its leading ``/`` removed and
    parsed as an int, ``type`` is element 2 of the entry as a float, and
    ``a..d`` are elements 3 to 6 as floats (the caller treats them as
    x1, y1, x2, y2).

    Entries that cannot be parsed are reported on stdout and skipped.
    """
    collected = []
    with pikepdf.open(pdf_path) as pdf:
        for page_index, page in enumerate(pdf.pages):
            piece_info = page.get('/PieceInfo', None)
            if not (piece_info and '/InDesign' in piece_info):
                continue
            indesign = piece_info['/InDesign']
            if target_key not in indesign:
                continue
            page_number = page_index + 1
            for key, value in indesign[target_key].items():
                try:
                    uid = int(str(key).lstrip('/'))
                    kind = float(value[2])
                    box = [float(item) for item in list(value)[3:7]]
                    collected.append([page_number, uid, kind] + box)
                except Exception as e:
                    # Best effort: one malformed entry must not abort the run.
                    print(f"Error parsing {key}:{value} ({e})")
    return collected

def get_pdf_page_count(pdf_path):
    """Return the number of pages in the PDF at *pdf_path*."""
    with pikepdf.open(pdf_path) as pdf:
        page_count = len(pdf.pages)
    return page_count

def _hack_page(page, max_pdf_pages):
    """Map the page number a data map was read from to the page it is drawn on.

    Data maps are read sequentially, so the page they were found on is
    remapped: the last page (or beyond) maps to page 2, page 1 stays
    page 1, and every other page is shifted up by one.
    """
    if page >= max_pdf_pages:
        return 2
    if page > 1:
        return page + 1
    return page


def process_rows(rows, max_pdf_pages, y_transform_base=420.945):
    """Remap pages and flip y-coordinates of extracted data map rows.

    Args:
        rows: iterable of ``[page, id, type, x1, y1, x2, y2]`` rows.
        max_pdf_pages: number of pages in the PDF; used by the page remap.
        y_transform_base: value the y-coordinates are subtracted from.
            The default is the hard-coded hack value for the sample PDF
            (presumably related to its page height -- TODO confirm).

    Returns:
        A list of ``[page, id, type, x1, y1, x2, y2, h]`` rows with
        coordinates rounded to 3 decimals and ``h`` (the absolute
        difference of the transformed y values) rounded to 1 decimal.
    """
    processed_rows = []
    for page, id_, type_val, x1, y1, x2, y2 in rows:
        new_y1 = round(y_transform_base - y1, 3)
        new_y2 = round(y_transform_base - y2, 3)
        new_x1 = round(x1, 3)
        new_x2 = round(x2, 3)
        h = round(abs(new_y2 - new_y1), 1)
        processed_rows.append([
            _hack_page(page, max_pdf_pages),
            id_,
            type_val,
            new_x1,
            new_y1,
            new_x2,
            new_y2,
            h,
        ])
    return processed_rows

def sort_and_filter(rows):
    """Order the rows and keep only type-4 rows whose height is 17.

    Rows are ordered by page ascending, then y2 descending (the key uses
    ``-y2``), then x1 ascending, then id ascending. Row layout is
    ``[page, id, type, x1, y1, x2, y2, h]``.
    """
    def ordering(row):
        return (row[0], -row[6], row[3], row[1])

    ordered = sorted(rows, key=ordering)
    return [row for row in ordered if row[2] == 4 and row[7] == 17]

def write_csv(csv_filename, rows):
    """Write *rows* to *csv_filename* (UTF-8) beneath a fixed header line."""
    header = ['page', 'id', 'type', 'x1', 'y1', 'x2', 'y2', 'h']
    with open(csv_filename, 'w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(header)
        for record in rows:
            out.writerow(record)

def mark_points_on_pdf(input_pdf, output_pdf, rows):
    """Draw a filled black dot for every row and save the result.

    Args:
        input_pdf: path of the PDF to open.
        output_pdf: path the marked document is saved to.
        rows: rows laid out as ``[page, id, type, x1, y1, x2, y2, h]``;
            ``page`` is 1-based, the dot is placed at ``x1`` (index 3)
            and ``y2`` (index 6).

    The document is closed even if drawing or saving fails.
    """
    doc = fitz.open(input_pdf)
    try:
        for row in rows:
            page_num = int(row[0])
            cx = row[3]
            cy = row[6]
            page = doc[page_num - 1]  # rows carry 1-based page numbers
            # The row's y value is measured from the opposite edge of the
            # page to the one PyMuPDF uses, so flip it with the page height.
            pymupdf_y = page.rect.height - cy
            page.draw_circle((cx, pymupdf_y), radius=2, color=(0, 0, 0), fill=(0, 0, 0))
        doc.save(output_pdf)
    finally:
        doc.close()

if __name__ == "__main__":
    # Pipeline: extract -> transform -> sort/filter -> CSV + marked PDF.
    points = extract_datamap_points(INPUT_PDF)
    # The page count drives the page remapping done by process_rows.
    total_pages = get_pdf_page_count(INPUT_PDF)
    processed_points = process_rows(points, total_pages)
    filtered_points = sort_and_filter(processed_points)
    write_csv(OUTPUT_CSV, filtered_points)
    mark_points_on_pdf(INPUT_PDF, OUTPUT_PDF, filtered_points)
    print(f"Done. Points: {len(filtered_points)}; Wrote {OUTPUT_CSV} and {OUTPUT_PDF}")

PDFtk:

Uncompress PDF page streams for editing the PDF in a text editor (e.g., vim, emacs)

pdftk doc.pdf output doc.unc.pdf uncompress

Gives: input_uncompressed.pdf

Following the suggestions in the comments, this code extracts the text blocks and marks a point for each one on the PDF, as shown below the code:

import fitz  # PyMuPDF

INPUT_PDF = "input.pdf"
OUTPUT_PDF = "output_bb.pdf"

BLUE = (0, 0, 1)  # RGB colour used for both the dots and their labels

# Mark the lower-left corner of every text block on every page with a blue
# dot labelled with the block number, then save the result.
doc = fitz.open(INPUT_PDF)
try:
    for page in doc:
        for block in page.get_text("blocks"):
            # Only the left edge, the bottom edge and the block number are used.
            x0, _, _, y1, _, block_no = block[:6]
            # Mark the lower-left corner (x0, y1)
            cx, cy = x0, y1
            shape = page.new_shape()
            # Draw a blue filled circle, radius 2
            shape.draw_circle((cx, cy), 2)
            shape.finish(color=BLUE, fill=BLUE)
            shape.commit()
            # Label with block number (remove if not wanted)
            page.insert_text((cx + 5, cy - 5), str(block_no), fontname="helv", fontsize=8, color=BLUE)
    doc.save(OUTPUT_PDF)
finally:
    # Release the document even if drawing or saving fails.
    doc.close()
print(f"Done. Saved {OUTPUT_PDF}")

(Click image to open 20 page pdf).

Solution

If the problem is restated as finding the white input boxes in the PDF (credit to a comment from K J), it can be solved fairly simply:

import fitz  # PyMuPDF
import csv

# PDF that is scanned for white-filled drawings.
INPUT_PDF = "input.pdf"
# The input PDF re-saved with a circle at each matching rectangle.
OUTPUT_PDF = "output.pdf"
# CSV file listing the page number and rectangle of each match.
OUTPUT_CSV = "output.csv"


def colour_match(color, target_color=(1, 1, 1)):
    """Tell whether *color* equals *target_color* exactly.

    The comparison is a plain ``==`` with no tolerance; the default
    target is ``(1, 1, 1)``.
    """
    is_same = color == target_color
    return is_same


doc = fitz.open(INPUT_PDF)

# Page indices are zero based.
# pages_to_mark = list(range(len(doc)))  # Default filter for all pages
pages_to_mark = [1]  # Example: only process page 2

# For each selected page, circle the lower-left corner of every drawing
# whose fill colour is exactly (1, 1, 1) and log its rectangle to the CSV.
with open(OUTPUT_CSV, mode="w", newline="", encoding="utf-8") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(["page_num", "x0", "y0", "x1", "y1"])
    for page_num in pages_to_mark:
        page = doc[page_num]
        drawings = page.get_drawings()
        shape = page.new_shape()
        for drawing in drawings:
            box = drawing.get("rect")
            fill = drawing.get("fill")
            if not (box and colour_match(fill, target_color=(1, 1, 1))):
                continue
            left, top, right, bottom = box
            # Circle of radius 2 points at the lower-left corner.
            shape.draw_circle((left, bottom), 2)
            # Record the full rectangle together with the page index.
            csvwriter.writerow([page_num, left, top, right, bottom])
        # One finish/commit per page: blue outline, no fill.
        shape.finish(color=(0, 0, 1), fill=None)
        shape.commit()

doc.save(OUTPUT_PDF)
doc.close()

The following image demonstrates the solution by showing character boxes on page 2 which were not previously returned: