Some Adobe XFA form fields are missing when the /PageItemUIDToLocationDataMap is extracted from certain PDF files. In the image below (click the image to open the PDF), only the fields marked with black dots on pages 1 and 3 are found. How can the missing XFA form fields be extracted without using commercial software?
The following code extracts the data maps from input.pdf, saves them to a CSV file, and marks the points in output.pdf. Disable sort_and_filter to see the original data in the .csv file:
import pikepdf
import fitz # PyMuPDF
import csv
# PDF that is read for data-map extraction and used as the base for marking.
INPUT_PDF = "input.pdf"
# CSV file that receives the processed point rows.
OUTPUT_CSV = "points.csv"
# Copy of the input PDF with the extracted points drawn on it.
OUTPUT_PDF = "output.pdf"
# Dictionary key looked up inside each page's /PieceInfo -> /InDesign entry.
TARGET_KEY = "/PageItemUIDToLocationDataMap"
def extract_datamap_points(pdf_path, target_key=TARGET_KEY):
    """Collect the location-map entries stored in each page's InDesign piece info.

    For every page that has a ``/PieceInfo`` dictionary containing
    ``/InDesign`` and, inside it, ``target_key``, each map entry is turned
    into a row ``[page_number, id, type, c0, c1, c2, c3]`` where:

    * ``page_number`` is the 1-based index of the page in the file,
    * ``id`` is the entry key parsed as an integer (leading ``/`` removed),
    * ``type`` is element 2 of the entry value as a float,
    * ``c0..c3`` are elements 3 to 6 of the entry value as floats.

    Entries that cannot be parsed are reported on stdout and skipped.

    Args:
        pdf_path: Path of the PDF file to open with pikepdf.
        target_key: Name of the map to read from the ``/InDesign`` dictionary.

    Returns:
        A list of rows as described above.
    """
    records = []
    with pikepdf.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            piece_info = page.get('/PieceInfo', None)
            # Pages without InDesign piece info carry no location map.
            if not piece_info or '/InDesign' not in piece_info:
                continue
            indesign = piece_info['/InDesign']
            if target_key not in indesign:
                continue
            for key, entry in indesign[target_key].items():
                try:
                    uid = int(str(key).lstrip('/'))
                    kind = float(entry[2])
                    box = [float(item) for item in list(entry)[3:7]]
                    records.append([page_number, uid, kind, *box])
                except Exception as e:
                    # Best effort: report the malformed entry and carry on.
                    print(f"Error parsing {key}:{entry} ({e})")
    return records
def get_pdf_page_count(pdf_path):
    """Open the PDF at ``pdf_path`` with pikepdf and return its page count."""
    with pikepdf.open(pdf_path) as document:
        page_total = len(document.pages)
    return page_total
def process_rows(rows, max_pdf_pages, y_transform_base=420.945):
    """Remap page numbers and flip y-coordinates of extracted data-map rows.

    Each input row is ``[page, id, type, x1, y1, x2, y2]``. The output row is
    ``[page, id, type, x1, y1, x2, y2, h]`` where:

    * ``page`` is remapped (see below),
    * ``y1`` / ``y2`` are replaced by ``y_transform_base - y`` rounded to
      3 decimals,
    * ``x1`` / ``x2`` are rounded to 3 decimals,
    * ``h`` is ``abs(new_y2 - new_y1)`` rounded to 1 decimal.

    Page remapping (a workaround described in the original code as "datamaps
    are read sequentially"): a page number greater than or equal to
    ``max_pdf_pages`` becomes 2, page 1 stays 1, and any other page is
    shifted up by one.

    This function performs no file I/O; the caller supplies the page count.

    Args:
        rows: Iterable of 7-element rows as described above.
        max_pdf_pages: Page count used as the threshold for the remapping.
        y_transform_base: Value from which each y-coordinate is subtracted.
            The default is the constant the original code hard-coded.

    Returns:
        A list of 8-element processed rows, in input order.
    """
    def remap_page(page):
        # Workaround for the order in which the data maps are stored.
        if page >= max_pdf_pages:
            return 2
        return page + 1 if page > 1 else page

    processed_rows = []
    for page, id_, type_val, x1, y1, x2, y2 in rows:
        new_y1 = round(y_transform_base - y1, 3)
        new_y2 = round(y_transform_base - y2, 3)
        height = round(abs(new_y2 - new_y1), 1)
        processed_rows.append([
            remap_page(page),
            id_,
            type_val,
            round(x1, 3),
            new_y1,
            round(x2, 3),
            new_y2,
            height,
        ])
    return processed_rows
def sort_and_filter(rows):
    """Order the processed rows and keep only type-4 rows whose height is 17.

    Rows are ordered by page (ascending), then y2 (descending), then x1
    (ascending), then id (ascending). After ordering, a row is kept only if
    its type (index 2) equals 4 and its height (index 7) equals 17.

    Args:
        rows: Iterable of 8-element rows ``[page, id, type, x1, y1, x2, y2, h]``.

    Returns:
        A new list with the surviving rows in sorted order.
    """
    def ordering(row):
        # Negating y2 yields a descending order on that column.
        return (row[0], -row[6], row[3], row[1])

    return [
        row
        for row in sorted(rows, key=ordering)
        if row[2] == 4 and row[7] == 17
    ]
def write_csv(csv_filename, rows):
    """Write a header line followed by ``rows`` to ``csv_filename``.

    The file is created or overwritten, encoded as UTF-8, and starts with the
    column names ``page, id, type, x1, y1, x2, y2, h``.

    Args:
        csv_filename: Path of the CSV file to write.
        rows: Iterable of row sequences to write after the header.
    """
    header = ['page', 'id', 'type', 'x1', 'y1', 'x2', 'y2', 'h']
    with open(csv_filename, 'w', newline='', encoding='utf-8') as handle:
        csv.writer(handle).writerows([header, *rows])
def mark_points_on_pdf(input_pdf, output_pdf, rows):
    """Draw a filled black dot on the PDF for every row and save a copy.

    For each row, the page is taken from index 0 (1-based page number), the
    x-coordinate from index 3 and the y-coordinate from index 6. The
    y-coordinate is subtracted from the page height before drawing.
    NOTE(review): this presumably converts from a bottom-left origin to
    PyMuPDF's top-left origin - confirm against the data source.

    The document is always closed, even when drawing or saving fails.

    Args:
        input_pdf: Path of the PDF to open.
        output_pdf: Path the marked document is saved to.
        rows: Iterable of rows ``[page, id, type, x1, y1, x2, y2, h]``.
    """
    doc = fitz.open(input_pdf)
    try:
        for row in rows:
            page_num = int(row[0])
            cx = row[3]
            cy = row[6]
            page = doc[page_num - 1]  # rows carry 1-based page numbers
            pymupdf_y = page.rect.height - cy
            page.draw_circle(
                (cx, pymupdf_y), radius=2, color=(0, 0, 0), fill=(0, 0, 0)
            )
        doc.save(output_pdf)
    finally:
        # Release the file handle regardless of success or failure.
        doc.close()
if __name__ == "__main__":
    # Pipeline: extract -> transform -> sort/filter -> CSV -> annotated PDF.
    points = extract_datamap_points(INPUT_PDF)
    # The page count must be computed here; it is not defined anywhere else
    # at module scope, so referencing it without this line raises NameError.
    total_pages = get_pdf_page_count(INPUT_PDF)
    processed_points = process_rows(points, total_pages)
    # Replace sort_and_filter(...) with processed_points to see the raw data.
    filtered_points = sort_and_filter(processed_points)
    write_csv(OUTPUT_CSV, filtered_points)
    mark_points_on_pdf(INPUT_PDF, OUTPUT_PDF, filtered_points)
    print(f"Done. Points: {len(filtered_points)}; Wrote {OUTPUT_CSV} and {OUTPUT_PDF}")
Uncompress PDF page streams for editing the PDF in a text editor (e.g., vim, emacs)
- pdftk doc.pdf output doc.unc.pdf uncompress
Gives: input_uncompressed.pdf
Following the suggestions in the comments, this code extracts the text blocks and adds points to the PDF, as shown below the code:
import fitz # PyMuPDF
# Script: mark the lower-left corner of every text block with a blue dot and
# label it with the block number, then save the result as a new PDF.
INPUT_PDF = "input.pdf"
OUTPUT_PDF = "output_bb.pdf"

doc = fitz.open(INPUT_PDF)
try:
    for page in doc:
        for block in page.get_text("blocks"):
            # Only the left edge, bottom edge and block number are needed.
            x0, _y0, _x1, y1, _text, block_no = block[:6]
            # Mark the lower-left corner (x0, y1)
            cx, cy = x0, y1
            shape = page.new_shape()
            # Draw a blue filled circle (RGB: (0, 0, 1)), radius 2
            shape.draw_circle((cx, cy), 2)
            shape.finish(color=(0, 0, 1), fill=(0, 0, 1))
            shape.commit()
            # Label with block number (remove if not wanted)
            page.insert_text(
                (cx + 5, cy - 5),
                str(block_no),
                fontname="helv",
                fontsize=8,
                color=(0, 0, 1),
            )
    doc.save(OUTPUT_PDF)
finally:
    # Always release the document, even if drawing or saving fails.
    doc.close()
print(f"Done. Saved {OUTPUT_PDF}")
(Click image to open 20 page pdf).
If the problem is restated as finding the white input boxes in the PDF (credit to a comment from K J), it can be solved fairly simply:
import fitz # PyMuPDF
import csv
# PDF whose vector drawings are scanned for white-filled rectangles.
INPUT_PDF = "input.pdf"
# Copy of the input PDF with a circle drawn at each matching rectangle.
OUTPUT_PDF = "output.pdf"
# CSV file that receives the page number and rectangle of each match.
OUTPUT_CSV = "output.csv"
def colour_match(color, target_color=(1, 1, 1), tol=0.0):
    """Return True if ``color`` matches ``target_color``.

    With the default ``tol`` of 0 the comparison is an exact ``==``, so a
    missing colour (``None``) never matches a tuple target. With a positive
    ``tol`` the colours match when they have the same number of components
    and every component differs by at most ``tol``; this allows for fills
    that are only approximately the target colour.

    Args:
        color: Colour to test, e.g. a tuple of floats, or ``None``.
        target_color: Colour to compare against; defaults to ``(1, 1, 1)``.
        tol: Maximum allowed absolute difference per component.

    Returns:
        True when the colours match, otherwise False.
    """
    if not tol:
        # Exact comparison: identical to the original behaviour.
        return color == target_color
    if color is None or target_color is None:
        return False
    try:
        if len(color) != len(target_color):
            return False
        return all(abs(a - b) <= tol for a, b in zip(color, target_color))
    except TypeError:
        # Non-sequence or non-numeric values cannot match within a tolerance.
        return False
# Script: find every drawing with a white fill on the selected pages, circle
# the lower-left corner of its rectangle, and log the rectangle to a CSV file.
doc = fitz.open(INPUT_PDF)

# Page numbers are zero based.
# To process every page instead, use: list(range(len(doc)))
pages_to_mark = [1]  # Example: only process page 2

with open(OUTPUT_CSV, mode="w", newline="", encoding="utf-8") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(["page_num", "x0", "y0", "x1", "y1"])

    for page_num in pages_to_mark:
        page = doc[page_num]
        shape = page.new_shape()

        for drawing in page.get_drawings():
            box = drawing.get("rect")
            fill = drawing.get("fill")
            # Skip drawings without a rectangle or without a white fill.
            if not box:
                continue
            if not colour_match(fill, target_color=(1, 1, 1)):
                continue

            x0, y0, x1, y1 = box
            # Circle of radius 2 points at the lower-left corner (x0, y1).
            shape.draw_circle((x0, y1), 2)
            # Record the page number and the full rectangle.
            csvwriter.writerow([page_num, x0, y0, x1, y1])

        # One finish/commit per page: blue stroke, no fill.
        shape.finish(color=(0, 0, 1), fill=None)
        shape.commit()

doc.save(OUTPUT_PDF)
doc.close()
The following image demonstrates the solution by showing character boxes on page 2 which were not previously returned: