java pdf accessibility pdfbox tagged-pdf

How to add an annotation tag to a tagged PDF using PDFBox


I have a tagged PDF (PDF/UA compliant) with a simple structure: Tagged PDF

I need to add the annotation:

    <?xml version="1.0" encoding="UTF-8"?>
    <xfdf xmlns="http://ns.adobe.com/xfdf/">
      <annots>
        <text color="#FFC333" date="D:20240425000000" flags="print,nozoom,norotate" opacity="0.600000" page="0" rect="43.086,707.79,63.086,732.79" title="ABC">
          <contents-richtext>
            <body xmlns="http://www.w3.org/1999/xhtml">
              <p dir="ltr">Sample comment</p>
            </body>
          </contents-richtext>
          <popup flags="nozoom,norotate" open="yes" page="0" rect="595.0,507.78998,790.0,632.79"/>
        </text>
      </annots>
    </xfdf>

nested inside an Annot tag within the P tag, like this (I created it manually in Adobe Acrobat): Expected result

I'm able to import the XFDF XML into the PDF:

    PDDocument document = PDDocument.load(new File("tagged.pdf"));
    
    FDFDocument fdfDoc = FDFDocument.loadXFDF(new File("annotation.xfdf"));
    List<FDFAnnotation> fdfAnnots = fdfDoc.getCatalog().getFDF().getAnnotations();
    
    List<PDAnnotation> pageAnnotations = new ArrayList<>();
    for (int i=0; i < fdfAnnots.size(); i++) {
        FDFAnnotation fdfannot = fdfAnnots.get(i);
        PDAnnotation pdfannot = PDAnnotation.createAnnotation(fdfannot.getCOSObject());
        pdfannot.constructAppearances();
        pageAnnotations.add(pdfannot);
    }
    document.getPage(0).setAnnotations(pageAnnotations);
    
    document.save(new File("tagged_with_annotation.pdf"));

but the veraPDF checker outputs:

An annotation, excluding annotations of subtype Widget, PrinterMark or Link, shall be nested within an Annot tag

and PDF Accessibility Checker (PAC tool) outputs:

Annotation is not nested inside an "Annot" structure element

as expected, because I didn't create an 'Annot' tag (or choose a concrete place in the tag tree) for the imported annotation.

Also, I'm able to obtain the tag P object:

    // Walk down the structure tree by taking the first kid at each level.
    // The variable names suggest the path Document -> Part -> Sect -> P; this
    // presumably matches the asker's file only - verify for any other document.
    PDDocumentCatalog pdDocumentCatalog = document.getDocumentCatalog();
    PDStructureTreeRoot pdStructureTreeRoot = pdDocumentCatalog.getStructureTreeRoot();
    // every /K entry is cast to COSArray and every kid to COSObject, so each step
    // fails with a ClassCastException if the file stores a single kid or a direct dictionary
    COSArray aDocument = (COSArray)(pdStructureTreeRoot.getK().getCOSObject());
    COSObject oDocument = (COSObject) aDocument.get(0);
    COSArray aPart = (COSArray)oDocument.getItem(COSName.K).getCOSObject();
    COSObject oPart = (COSObject) aPart.get(0);
    COSArray aSect = (COSArray)oPart.getItem(COSName.K).getCOSObject();
    COSObject oSect = (COSObject) aSect.get(0);
    // first kid of the Sect level; the surrounding text calls this the P tag
    COSArray aP = (COSArray)oSect.getItem(COSName.K).getCOSObject();
    COSObject oP= (COSObject) aP.get(0);

But I don't understand how to edit the existing tag tree in the PDF.

Question: how can I add the tag and the annotation to the existing tag tree of the PDF using PDFBox?

UPDATE: the example PDF and XFDF here.

UPDATE 2: I've updated the file tagged_with_annotation_acrobat.pdf on Google Drive. The previous version was incorrect - it was missing the Annotation tag.

UPDATE 3: To select the last P, change:

// before: aP is the /K array read directly from the oSect dictionary
COSArray aP = oSect.getCOSArray(COSName.K);

to

// /K array of the oSect dictionary (the name suggests it holds an H1 and a P - TODO confirm)
COSArray aH1P = oSect.getCOSArray(COSName.K);
// entry at index 1, resolved to its dictionary; the text above calls it the last P
COSDictionary oP = (COSDictionary) aH1P.getObject(1);
// /K array of that P dictionary
COSArray aP = oP.getCOSArray(COSName.K);

and, to add the annotation inside the last P without the outer P, use:

    // add the annotation element
    COSDictionary anDict = new COSDictionary();
    // structure type /Annot, with oP as parent and the page as /Pg
    anDict.setItem(COSName.S, COSName.ANNOT);
    anDict.setItem(COSName.P, oP);
    anDict.setItem(COSName.PG, page);
    
    // the only kid of the Annot element is an object reference to the annotation
    PDObjectReference objRef = new PDObjectReference();
    anDict.setItem(COSName.K, objRef);
    
    // point the object reference at the first annotation of the page
    objRef.setReferencedObject(page.getAnnotations().get(0));
    
    // append the new element to the kids array aP
    aP.add(anDict);

Solution

  • Here's my attempt at this. It is very tailored to your PDF. I have placed the annotation in the array Root/StructTreeRoot/K/[0]/K/[0]/K/[0]/K.

    PDStructureTreeRoot structureTreeRoot = doc.getDocumentCatalog().getStructureTreeRoot();
    PDPage page = doc.getPage(0);
    PDAnnotation annotation = page.getAnnotations().get(0);
    
    // descend to Root/StructTreeRoot/K/[0]/K/[0]/K/[0] by taking the first kid at each level
    PDStructureNode documentStructureNode = (PDStructureNode) structureTreeRoot.getKids().get(0);
    PDStructureNode partStructureNode = (PDStructureNode) documentStructureNode.getKids().get(0);
    PDStructureNode sectStructureNode = (PDStructureNode) partStructureNode.getKids().get(0);
    
    // add a dummy intermediate /P element
    PDStructureElement pStructureElement = new PDStructureElement("P", sectStructureNode);
    sectStructureNode.appendKid(pStructureElement);
    
    // add the annotation element
    PDStructureElement annotStructureElement = new PDStructureElement("Annot", pStructureElement);
    annotStructureElement.setPage(page);
    pStructureElement.appendKid(annotStructureElement);
    // see "Entries in an object reference dictionary"
    PDObjectReference objRef = new PDObjectReference();
    objRef.setReferencedObject(annotation);
    annotStructureElement.appendKid(objRef);
    
    // register the annotation in the parent tree so that its /StructParent key
    // maps to the new Annot structure element
    int parentTreeNextKey = structureTreeRoot.getParentTreeNextKey();
    PDNumberTreeNode parentTree = structureTreeRoot.getParentTree();
    Map<Integer, COSObjectable> numberTreeAsMap = getNumberTreeAsMap(parentTree);
    // highest key currently in use; -1 when the parent tree is empty
    int maxKey = numberTreeAsMap.keySet().stream().reduce(Integer::max).orElse(-1);
    // a missing (-1) or too small /ParentTreeNextKey is replaced by the first free key
    parentTreeNextKey = Math.max(parentTreeNextKey, maxKey + 1);
    int annotKey = annotation.getStructParent();
    if (annotKey == -1)
    {
        // -1 is treated as "the annotation has no /StructParent yet":
        // take the next free key and advance /ParentTreeNextKey past it
        annotKey = parentTreeNextKey;
        annotation.setStructParent(annotKey);
        structureTreeRoot.setParentTreeNextKey(annotKey + 1);
    }
    numberTreeAsMap.put(annotKey, annotStructureElement);
    // rebuild the parent tree as a single node holding all entries
    parentTree = new PDNumberTreeNode(PDParentTreeValue.class);
    parentTree.setNumbers(numberTreeAsMap);
    structureTreeRoot.setParentTree(parentTree);
    

    Previous version that uses more of the COS API:

    PDStructureTreeRoot structureTreeRoot = doc.getDocumentCatalog().getStructureTreeRoot();
    PDPage page = doc.getPage(0);
    PDAnnotation annotation = page.getAnnotations().get(0);
    
    // descend through the /K arrays, taking the first kid at each level
    COSArray aDocument = (COSArray) structureTreeRoot.getK();
    COSDictionary oDocument = (COSDictionary) aDocument.getObject(0);
    COSArray aPart = oDocument.getCOSArray(COSName.K);
    COSDictionary oPart = (COSDictionary) aPart.getObject(0);
    COSArray aSect = oPart.getCOSArray(COSName.K);
    COSDictionary oSect = (COSDictionary) aSect.getObject(0);
    COSArray aP = oSect.getCOSArray(COSName.K);
    
    // add a dummy /P element
    // (it later turned out that this wasn't really needed but doesn't hurt either,
    // but it was like that in the "good" file I used for inspiration.
    // I can remove it if you want)
    COSDictionary pDict = new COSDictionary(); // PDStructureElement
    pDict.setItem(COSName.S, COSName.P);
    // the parent is the first entry of aSect; get(0) instead of getObject(0)
    // stores the array entry itself rather than the resolved dictionary
    pDict.setItem(COSName.P, aSect.get(0));
    
    // add the annotation element
    COSDictionary anDict = new COSDictionary(); // PDStructureElement
    pDict.setItem(COSName.K, anDict);
    anDict.setItem(COSName.S, COSName.ANNOT);
    anDict.setItem(COSName.P, pDict);
    anDict.setItem(COSName.PG, page);
    // see "Entries in an object reference dictionary"
    PDObjectReference objRef = new PDObjectReference();
    anDict.setItem(COSName.K, objRef);
    objRef.setReferencedObject(annotation);
    // not needed here because done in parent; however
    //TODO update PDObjectReference API accordingly
    //objRef.getCOSObject().setItem(COSName.PG, page);
    // insert the dummy /P as the first kid
    aP.add(0, pDict);
        
    // register the annotation in the parent tree so that its /StructParent key
    // maps to the new Annot dictionary
    int parentTreeNextKey = structureTreeRoot.getParentTreeNextKey();
    PDNumberTreeNode parentTree = structureTreeRoot.getParentTree();
    Map<Integer, COSObjectable> numberTreeAsMap = getNumberTreeAsMap(parentTree);
    // highest key currently in use; -1 when the parent tree is empty
    int maxKey = numberTreeAsMap.keySet().stream().reduce(Integer::max).orElse(-1);
    // a missing (-1) or too small /ParentTreeNextKey is replaced by the first free key
    parentTreeNextKey = Math.max(parentTreeNextKey, maxKey + 1);
    int annotKey = annotation.getStructParent();
    if (annotKey == -1)
    {
        // -1 is treated as "the annotation has no /StructParent yet":
        // take the next free key and advance /ParentTreeNextKey past it
        annotKey = parentTreeNextKey;
        annotation.setStructParent(annotKey);
        structureTreeRoot.setParentTreeNextKey(annotKey + 1);
    }
    numberTreeAsMap.put(annotKey, anDict);
    // rebuild the parent tree as a single node holding all entries
    parentTree = new PDNumberTreeNode(PDParentTreeValue.class);
    parentTree.setNumbers(numberTreeAsMap);
    structureTreeRoot.setParentTree(parentTree);
    

    getNumberTreeAsMap is a convenience method found in PDFMergerUtility because the available API doesn't retrieve the whole tree at once.

    /**
     * Flattens a number tree into one map by collecting the entries of the given
     * node and, recursively, of all of its kids.
     *
     * @param tree the node to flatten; may be null
     * @return a new, modifiable map holding the node's own numbers followed by those
     *         of its kids; empty if {@code tree} is null
     * @throws IOException if reading the tree fails
     */
    static Map<Integer, COSObjectable> getNumberTreeAsMap(PDNumberTreeNode tree)
            throws IOException
    {
        Map<Integer, COSObjectable> collected = new LinkedHashMap<>();
        if (tree == null)
        {
            return collected;
        }

        // entries are copied into our own map instead of modifying the returned one
        Map<Integer, COSObjectable> ownNumbers = tree.getNumbers();
        if (ownNumbers != null)
        {
            collected.putAll(ownNumbers);
        }

        List<PDNumberTreeNode> children = tree.getKids();
        if (children == null)
        {
            return collected;
        }
        for (PDNumberTreeNode child : children)
        {
            collected.putAll(getNumberTreeAsMap(child));
        }
        return collected;
    }
    

    VeraPDF complains about an incorrect /CIDSet entry, see here on how to fix this