Tags: python, pdf, pymupdf, pikepdf

Extract White Input Boxes from pdf


When the /PageItemUIDToLocationDataMap is extracted from some PDF files, some of the Adobe XFA form fields are missing. In the image below (click the image to open the PDF), only the fields marked with black dots on pages 1 and 3 were found. How can the missing XFA form fields be extracted without using commercial software?

Sample pdf document output

The following code extracts the data maps from input.pdf, saves them to a CSV file, and marks the points in output.pdf. Disable sort_and_filter to see the original data in the .csv file:

import pikepdf
import fitz  # PyMuPDF
import csv

# Source PDF read by extract_datamap_points and mark_points_on_pdf.
INPUT_PDF = "input.pdf"
# CSV file that write_csv fills with the filtered rows.
OUTPUT_CSV = "points.csv"
# Copy of the input PDF with the extracted points drawn on it.
OUTPUT_PDF = "output.pdf"
# Dictionary key looked up under each page's /PieceInfo -> /InDesign entry.
TARGET_KEY = "/PageItemUIDToLocationDataMap"

def extract_datamap_points(pdf_path, target_key=TARGET_KEY):
    """Collect the datamap entries stored under /PieceInfo -> /InDesign.

    Args:
        pdf_path: Path of the PDF opened with pikepdf.
        target_key: Key of the datamap dictionary inside /InDesign.

    Returns:
        A list of ``[page_number, id, type, *coords]`` lists, where
        ``page_number`` is 1-based, ``type`` is element 2 of the entry and
        ``coords`` are elements 3..6 of the entry converted to float.
        Entries that cannot be parsed are reported with ``print`` and skipped.
    """
    collected = []
    with pikepdf.open(pdf_path) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            piece_info = page.get('/PieceInfo', None)
            # Pages without InDesign piece info carry no datamap.
            if not piece_info or '/InDesign' not in piece_info:
                continue
            indesign = piece_info['/InDesign']
            if target_key not in indesign:
                continue
            for k, v in indesign[target_key].items():
                try:
                    uid = int(str(k).lstrip('/'))
                    kind = float(v[2])
                    box = [float(val) for val in list(v)[3:7]]
                except Exception as e:
                    # Best effort: report the bad entry and keep going.
                    print(f"Error parsing {k}:{v} ({e})")
                else:
                    collected.append([page_no, uid, kind, *box])
    return collected

def get_pdf_page_count(pdf_path):
    """Return the number of pages in the PDF at *pdf_path* (opened via pikepdf)."""
    with pikepdf.open(pdf_path) as pdf:
        page_total = len(pdf.pages)
    return page_total

def process_rows(rows, max_pdf_pages, y_transform_base=420.945):
    """Remap page numbers and flip the y-coordinates of extracted rows.

    Args:
        rows: Iterable of ``[page, id, type, x1, y1, x2, y2]`` records.
        max_pdf_pages: Page-number threshold; records whose page number is
            greater than or equal to it are reassigned to page 2.
        y_transform_base: Value each y-coordinate is subtracted from.
            Defaults to the constant the script originally hard-coded.

    Returns:
        A list of ``[page, id, type, x1, y1, x2, y2, h]`` lists. Coordinates
        are rounded to 3 decimals and ``h`` is ``abs(y2 - y1)`` of the
        transformed values rounded to 1 decimal.
    """

    def remap_page(page):
        # HACK (kept from the original script): the datamaps are read
        # sequentially, so page numbers are remapped -- entries at or past
        # max_pdf_pages go to page 2 and every page after the first shifts
        # up by one.
        if page >= max_pdf_pages:
            return 2
        return page + 1 if page > 1 else page

    processed_rows = []
    for page, id_, type_val, x1, y1, x2, y2 in rows:
        new_y1 = round(y_transform_base - y1, 3)
        new_y2 = round(y_transform_base - y2, 3)
        processed_rows.append([
            remap_page(page),
            id_,
            type_val,
            round(x1, 3),
            new_y1,
            round(x2, 3),
            new_y2,
            round(abs(new_y2 - new_y1), 1),
        ])
    return processed_rows

def sort_and_filter(rows):
    """Order the rows and keep only type-4 rows whose height equals 17.

    Rows are ordered by page (ascending), then y2 (descending), then x1
    (ascending), then id (ascending). The input is not modified.
    """

    def ordering(record):
        page, uid, x1, y2 = record[0], record[1], record[3], record[6]
        return (page, -y2, x1, uid)

    ordered = list(rows)
    ordered.sort(key=ordering)
    # Index 2 is the type, index 7 the height.
    return [rec for rec in ordered if rec[2] == 4 and rec[7] == 17]

def write_csv(csv_filename, rows):
    """Write a header line followed by *rows* to *csv_filename* as UTF-8 CSV."""
    header = ['page', 'id', 'type', 'x1', 'y1', 'x2', 'y2', 'h']
    # newline='' lets the csv module control line endings itself.
    with open(csv_filename, 'w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(header)
        for record in rows:
            out.writerow(record)

def mark_points_on_pdf(input_pdf, output_pdf, rows):
    """Draw a filled black dot for every row and save the result as a new PDF.

    Args:
        input_pdf: Path of the PDF opened with PyMuPDF.
        output_pdf: Path the annotated document is saved to.
        rows: Records whose element 0 is a 1-based page number, element 3
            the x-coordinate and element 6 the y-coordinate of the dot.
    """
    doc = fitz.open(input_pdf)
    try:
        for row in rows:
            page = doc[int(row[0]) - 1]  # rows carry 1-based page numbers
            cx, cy = row[3], row[6]
            # The stored y is subtracted from the page height before drawing.
            pymupdf_y = page.rect.height - cy
            page.draw_circle((cx, pymupdf_y), radius=2, color=(0, 0, 0), fill=(0, 0, 0))
        doc.save(output_pdf)
    finally:
        # Release the document even if drawing or saving fails.
        doc.close()

if __name__ == "__main__":
    # Pipeline: extract -> transform -> sort/filter -> CSV -> annotated PDF.
    points = extract_datamap_points(INPUT_PDF)
    # The page count must be computed here: `total_pages` was otherwise never
    # defined at module level, so the call below raised NameError.
    total_pages = get_pdf_page_count(INPUT_PDF)
    processed_points = process_rows(points, total_pages)
    filtered_points = sort_and_filter(processed_points)
    write_csv(OUTPUT_CSV, filtered_points)
    mark_points_on_pdf(INPUT_PDF, OUTPUT_PDF, filtered_points)
    print(f"Done. Points: {len(filtered_points)}; Wrote {OUTPUT_CSV} and {OUTPUT_PDF}")

PDFtk:

Uncompress PDF page streams for editing the PDF in a text editor (e.g., vim, emacs)

  • pdftk doc.pdf output doc.unc.pdf uncompress

Gives: input_uncompressed.pdf


Following suggestions in the comments, this code extracts the text blocks and marks each one with a point in the PDF, as shown below the code:

import fitz  # PyMuPDF

# PDF whose text blocks are located, and the annotated copy that is written.
INPUT_PDF = "input.pdf"
OUTPUT_PDF = "output_bb.pdf"

doc = fitz.open(INPUT_PDF)
try:
    for page in doc:
        # One shape per page collects every marker and is committed once.
        shape = page.new_shape()
        for block in page.get_text("blocks"):
            # Only the left edge, bottom edge and block number are needed.
            x0, _, _, y1, _, block_no = block[:6]
            # Mark the lower-left corner (x0, y1)
            cx, cy = x0, y1
            # Draw a blue filled circle (RGB: (0, 0, 1)), radius 2
            shape.draw_circle((cx, cy), 2)
            shape.finish(color=(0, 0, 1), fill=(0, 0, 1))
            # Label with block number (remove if not wanted)
            page.insert_text((cx + 5, cy - 5), str(block_no), fontname="helv", fontsize=8, color=(0, 0, 1))
        shape.commit()

    doc.save(OUTPUT_PDF)
finally:
    # Release the document even if annotation or saving fails.
    doc.close()
print(f"Done. Saved {OUTPUT_PDF}")

Sample pdf document output for text Box

(Click image to open 20 page pdf).


Solution

  • If the problem is restated as finding the white input boxes in the PDF (credit to a comment from K J), it can be solved fairly simply:

    import fitz  # PyMuPDF
    import csv
    
    # PDF scanned for white-filled drawings.
    INPUT_PDF = "input.pdf"
    # Copy of the input with a circle drawn on each match.
    OUTPUT_PDF = "output.pdf"
    # CSV listing the page index and rectangle of each match.
    OUTPUT_CSV = "output.csv"
    
    
    def colour_match(color, target_color=(1, 1, 1), tolerance=0.0):
        """Return True if *color* matches *target_color* component by component.

        Args:
            color: Sequence of numeric colour components, or None when a
                drawing has no fill.
            target_color: Colour to compare against; defaults to white.
            tolerance: Largest allowed absolute difference per component.
                The default of 0.0 requires an exact match.

        Returns:
            True when both colours have the same number of components and
            every pair differs by at most *tolerance*; otherwise False.
            Values that are not numeric sequences (such as None) fall back
            to a plain equality test.
        """
        try:
            return len(color) == len(target_color) and all(
                abs(c - t) <= tolerance for c, t in zip(color, target_color)
            )
        except TypeError:
            # None or non-numeric input: keep the plain equality semantics.
            return color == target_color
    
    
    doc = fitz.open(INPUT_PDF)

    # Zero-based page indices to process.
    # pages_to_mark = list(range(len(doc)))  # Default filter for all pages
    pages_to_mark = [1]  # Example: only process page 2

    with open(OUTPUT_CSV, mode="w", newline="", encoding="utf-8") as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(["page_num", "x0", "y0", "x1", "y1"])
        for page_num in pages_to_mark:
            page = doc[page_num]
            shape = page.new_shape()
            for drawing in page.get_drawings():
                rect = drawing.get("rect")
                # Skip drawings without a rectangle or without a white fill.
                if not rect:
                    continue
                if not colour_match(drawing.get("fill"), target_color=(1, 1, 1)):
                    continue
                x0, y0, x1, y1 = rect
                # Circle of radius 2 points on the lower-left corner (x0, y1).
                shape.draw_circle((x0, y1), 2)
                # Record the page index together with the full rectangle.
                csvwriter.writerow([page_num, x0, y0, x1, y1])
            # Blue stroke, no fill, applied to every circle drawn on this page.
            shape.finish(color=(0, 0, 1), fill=None)
            shape.commit()

    doc.save(OUTPUT_PDF)
    doc.close()
    

    The following image demonstrates the solution by showing character boxes on page 2 which were not previously returned:

    example showing circles on white boxes of page 2