Tags: java, ms-word, apache-poi, openxml, export-to-pdf

How to replace all docx controls with the text within using Apache POI


I have Word docx documents with content controls. I cannot use Microsoft Word. I use POI for some tasks inside the Word document, and I'm also using POI to convert the docx to PDF (fr.opensagres.poi.xwpf.converter.pdf 2.0.1 with POI 3.17 - they work well together). I know it is not the latest release of POI, but version 3.17 works well with the latest opensagres version I found, 2.0.1. The solution has to be free, open-source Java software.

The conversion works well except for the text inside the controls: it ignores them. So I want to delete the controls and replace them with the text they contain before doing the conversion. It is very easy with Visual Basic, but I do not have access to Word. In Visual Basic it is done by iterating over all the ContentControls of the document and deleting them.

' Visit every entry of wordDoc.ContentControls and call Delete on it.
ForAll c In wordDoc.ContentControls
c.Delete
End ForAll

The controls are deleted and replaced by their contents. But I'm lost in POI.

This is not about finding some text and replacing it. It is about finding all controls and replacing them (deleting the control) with the text within them, keeping the formatting they have. I found the question "Read sections from Word documents with Apache POI", and I can see the list of all the controls by their names and contents using AbstractXWPFSDT, but it is read-only and I cannot change anything. I could not find a way to get those controls in a form I can modify.

Any help? Thanks a lot


Solution

  • It is true that XWPFAbstractSDT as well as XWPFSDT and XWPFSDTCell are not very comfortable to use. So I decided to forgo them when it comes to handling Microsoft Word content controls. I get all those content controls as XmlObject using pure XML methods on the XML of document.xml.

    Code:

     /**
      * Collects every content control (w:sdt element) found below the document body.
      *
      * The whole token stream after the body start is walked, so content controls nested
      * inside paragraphs, tables or other content controls are returned as well, in
      * document order. Only the main document body is searched.
      *
      * @param document the Word document to scan
      * @return the w:sdt elements as XmlObjects; empty if the body contains none
      */
     List<XmlObject> extractSDTsFromBody(XWPFDocument document) {
      QName qnameSdt = new QName("http://schemas.openxmlformats.org/wordprocessingml/2006/main", "sdt", "w");
      List<XmlObject> allsdts = new ArrayList<XmlObject>();
      XmlCursor xmlcursor = document.getDocument().getBody().newCursor();
      try {
       while (xmlcursor.hasNextToken()) {
        XmlCursor.TokenType tokentype = xmlcursor.toNextToken();
        if (tokentype.isStart() && qnameSdt.equals(xmlcursor.getName())) {
         // Fetch the object once; a null check is all the former instanceof test amounted to.
         XmlObject sdt = xmlcursor.getObject();
         if (sdt != null) {
          allsdts.add(sdt);
         }
        }
       }
      } finally {
       // Cursors hold on to store resources until they are disposed.
       xmlcursor.dispose();
      }
      return allsdts;
     }
    

    Having that List<XmlObject> allsdts, one can get CTSdtBlock, CTSdtRun and CTSdtCell, depending on whether it is a block content control, an inline content control or a table cell content control.

    An inline content control sdt-XML-element looks like so:

    <w:sdt> <!-- structured document tags -->
     <w:sdtPr> ... </w:sdtPr> <!-- sdt properties -->
     <w:sdtContent> <!-- sdt content-->
      <w:r> <!-- text run-->
      ...
       <w:t>Text content</w:t> <!-- text -->
      </w:r>
      ... <!-- further text runs-->
     </w:sdtContent>
    </w:sdt>
    

    To get only the text runs out of the sdt content one can unset the sdt properties, then move the XML <w:r>...</w:r> parts out of the <w:sdt><w:sdtContent> ... </w:sdtContent></w:sdt> XML. After that the <w:sdt><w:sdtContent> ... </w:sdtContent></w:sdt> is empty and can be removed.

    Code:

     /**
      * Replaces each given content control by its own content.
      *
      * For every block, inline (run) or table cell content control, all child elements of
      * its w:sdtContent are moved, in order, to the position directly in front of the
      * w:sdt element; afterwards the w:sdt element itself is removed. Moving every
      * child element, not only w:p / w:r / w:tc, keeps content such as tables or
      * hyperlinks that would otherwise be deleted together with the control.
      * Objects of any other type are left untouched.
      *
      * Failures are reported on standard error and end the processing; no exception is
      * propagated to the caller.
      *
      * @param allsdts the w:sdt elements to unwrap
      */
     void replaceContentControls(List<XmlObject> allsdts) {
      try {
       for (XmlObject object : allsdts) {
        // Pick the w:sdtContent wrapper of the three supported control kinds.
        XmlObject sdtContent;
        if (object instanceof CTSdtBlock) {
         sdtContent = ((CTSdtBlock)object).getSdtContent();
        } else if (object instanceof CTSdtRun) {
         sdtContent = ((CTSdtRun)object).getSdtContent();
        } else if (object instanceof CTSdtCell) {
         sdtContent = ((CTSdtCell)object).getSdtContent();
        } else {
         continue;
        }
        // Stays on the start of the w:sdt element; moved XML is inserted in front of it.
        XmlCursor toHere = object.newCursor();
        try {
         // A control without w:sdtContent has nothing to move and is only removed.
         boolean moved = (sdtContent != null);
         while (moved) {
          // Always take the current first child, so the original order is kept.
          XmlCursor childCursor = sdtContent.newCursor();
          try {
           moved = childCursor.toFirstChild() && childCursor.moveXml(toHere);
          } finally {
           childCursor.dispose();
          }
         }
         // Removes the emptied w:sdt element including its properties.
         toHere.removeXml();
        } finally {
         toHere.dispose();
        }
       }
      } catch (Exception ex) {
       ex.printStackTrace(); 
      }
     }
    

    Complete Example:

    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    
    import org.apache.poi.xwpf.usermodel.*;
    
    import java.util.List;
    import java.util.ArrayList;
    
    import org.apache.xmlbeans.XmlCursor;
    import org.apache.xmlbeans.XmlObject;
    import javax.xml.namespace.QName;
    
    import org.openxmlformats.schemas.wordprocessingml.x2006.main.*;
    
    /**
     * Example program: removes the content controls of a Word document while keeping
     * what they contain, then saves the result as a new file.
     */
    public class WordReplaceContentControls {

     /**
      * Collects every content control (w:sdt element) found below the document body.
      *
      * The whole token stream after the body start is walked, so content controls nested
      * inside paragraphs, tables or other content controls are returned as well, in
      * document order. Only the main document body is searched.
      *
      * @param document the Word document to scan
      * @return the w:sdt elements as XmlObjects; empty if the body contains none
      */
     private static List<XmlObject> extractSDTsFromBody(XWPFDocument document) {
      QName qnameSdt = new QName("http://schemas.openxmlformats.org/wordprocessingml/2006/main", "sdt", "w");
      List<XmlObject> allsdts = new ArrayList<XmlObject>();
      XmlCursor xmlcursor = document.getDocument().getBody().newCursor();
      try {
       while (xmlcursor.hasNextToken()) {
        XmlCursor.TokenType tokentype = xmlcursor.toNextToken();
        if (tokentype.isStart() && qnameSdt.equals(xmlcursor.getName())) {
         // Fetch the object once; a null check is all the former instanceof test amounted to.
         XmlObject sdt = xmlcursor.getObject();
         if (sdt != null) {
          allsdts.add(sdt);
         }
        }
       }
      } finally {
       // Cursors hold on to store resources until they are disposed.
       xmlcursor.dispose();
      }
      return allsdts;
     }

     /**
      * Replaces each given content control by its own content.
      *
      * For every block, inline (run) or table cell content control, all child elements of
      * its w:sdtContent are moved, in order, to the position directly in front of the
      * w:sdt element; afterwards the w:sdt element itself is removed. Moving every
      * child element, not only w:p / w:r / w:tc, keeps content such as tables or
      * hyperlinks that would otherwise be deleted together with the control.
      * Objects of any other type are left untouched.
      *
      * Failures are reported on standard error and end the processing; no exception is
      * propagated to the caller.
      *
      * @param allsdts the w:sdt elements to unwrap
      */
     static void replaceContentControls(List<XmlObject> allsdts) {
      try {
       for (XmlObject object : allsdts) {
        // Pick the w:sdtContent wrapper of the three supported control kinds.
        XmlObject sdtContent;
        if (object instanceof CTSdtBlock) {
         sdtContent = ((CTSdtBlock)object).getSdtContent();
        } else if (object instanceof CTSdtRun) {
         sdtContent = ((CTSdtRun)object).getSdtContent();
        } else if (object instanceof CTSdtCell) {
         sdtContent = ((CTSdtCell)object).getSdtContent();
        } else {
         continue;
        }
        // Stays on the start of the w:sdt element; moved XML is inserted in front of it.
        XmlCursor toHere = object.newCursor();
        try {
         // A control without w:sdtContent has nothing to move and is only removed.
         boolean moved = (sdtContent != null);
         while (moved) {
          // Always take the current first child, so the original order is kept.
          XmlCursor childCursor = sdtContent.newCursor();
          try {
           moved = childCursor.toFirstChild() && childCursor.moveXml(toHere);
          } finally {
           childCursor.dispose();
          }
         }
         // Removes the emptied w:sdt element including its properties.
         toHere.removeXml();
        } finally {
         toHere.dispose();
        }
       }
      } catch (Exception ex) {
       ex.printStackTrace(); 
      }
     }

     /**
      * Reads ./WordFormContentControls.docx, unwraps its content controls and writes
      * the result to ./WordFormContentControlsResult.docx.
      *
      * @param args not used
      */
     public static void main(String[] args) {

      // The input stream is its own resource so it is closed even if opening the document fails.
      try (FileInputStream in = new FileInputStream("./WordFormContentControls.docx");
           XWPFDocument document = new XWPFDocument(in)) {

       List<XmlObject> allsdts = extractSDTsFromBody(document);

       replaceContentControls(allsdts);

       // Open the result file only now, so a failed read does not leave an empty file behind.
       try (FileOutputStream out = new FileOutputStream("./WordFormContentControlsResult.docx")) {
        document.write(out);
       }

      } catch (Exception ex) {
       ex.printStackTrace(); 
      }
     }
    }
    

    If ./WordFormContentControls.docx contains content controls, then after running the code, ./WordFormContentControlsResult.docx should contain only the text without the controls.