java java-8 apache-poi xwpf

The values "#name#" and "#surname#" aren't read by Apache POI


My goal is to read a docx file, find the placeholder texts "#name#" and "#surname#", and replace them with some other arbitrary text:

This is my docx file: docx

I do this:

// Attempt from the question: visit every run of the body paragraphs and of the
// table-cell paragraphs, and rewrite runs whose text starts and ends with '#'.
// NOTE: getParagraphs() and getTables() do not cover paragraphs that live inside
// drawings (text boxes, shapes), so placeholders located there are never visited.
XWPFDocument docx = new XWPFDocument(OPCPackage.open("..."));
  
            // Pass 1: paragraphs returned by the document itself.
            for (XWPFParagraph p : docx.getParagraphs()) {
                List<XWPFRun> runs = p.getRuns();
            
                if (runs != null) {
                    
                    for (XWPFRun r : runs) {
                        // Only the first text element (index 0) of the run is inspected.
                        String text = r.getText(0);
                        // The test is per run: a placeholder that is split across several
                        // runs never starts AND ends with '#' within a single run.
                        if (text != null && text.startsWith("#") && text.endsWith("#")) {
                            // Replaces every '#' with "new ", e.g. "#name#" becomes "new namenew ".
                            text = text.replace("#", "new ");
                            r.setText(text, 0);
                        }
                      
                    }
                }
                
            }
            // Pass 2: the same check for paragraphs inside table cells.
            for (XWPFTable tbl : docx.getTables()) {
                   for (XWPFTableRow row : tbl.getRows()) {
                      for (XWPFTableCell cell : row.getTableCells()) {
                         for (XWPFParagraph p : cell.getParagraphs()) {
                   

          
                        for (XWPFRun r : p.getRuns()) {
                          String text = r.getText(0);
                          if (text != null && text.startsWith("#") && text.endsWith("#")) {
                            text = text.replace("#", "new ");
                            r.setText(text,0);
                          }
                        }
                     }
                  }
               }
            // NOTE(review): the closing brace of the outer "for (XWPFTable ...)" loop is missing from this snippet.
    

The problem is that my code reads all the other labels in the docx file, but it doesn't read the labels "#surname#" and "#name#". Can anyone help me?


Solution

  • From your screenshot it looks like "#name#" and "#surname#" are not directly in the document body but in a drawing (a text box or a shape, for example). Such elements are not covered by XWPFDocument.getParagraphs, .getTables or any other high-level method in Apache POI. So your main problem is that the paragraphs which contain your text are simply not traversed by your code.

    The only way to really get all paragraphs out of the document's body is to use an XmlCursor which selects all w:p elements from the XML directly.

    The code below shows that. It traverses all XWPFParagraphs in the document's body using an XmlCursor and replaces the text where it is found.

    For the replacement itself I prefer the TextSegment replacement approach already shown in "Apache POI: ${my_placeholder} is treated as three different runs". This is necessary because, even if the containing paragraph gets traversed, the text may be split across several text runs because of formatting, spell checking or other, less obvious reasons. Microsoft Word has a nearly infinite number of reasons to split text into separate text runs.

    import java.io.*;
    import org.apache.poi.xwpf.usermodel.*;
    import org.openxmlformats.schemas.wordprocessingml.x2006.main.*;
    
    import org.apache.xmlbeans.XmlObject;
    import org.apache.xmlbeans.XmlCursor;
    
    import java.util.Map;
    import java.util.HashMap;
    import java.util.List;
    import java.util.ArrayList;
    
    /**
     * Replaces placeholder texts in every paragraph of a Word document, including
     * paragraphs that are only reachable through the raw XML (selected with an
     * {@link XmlCursor}) rather than through {@code XWPFDocument.getParagraphs()}.
     *
     * <p>Placeholders are located with a run-spanning search, so a placeholder that
     * is split over several text runs is still found and replaced.
     */
    public class WordReplaceTextSegment {

        /** XPath namespace declaration shared by all WordprocessingML queries in this class. */
        private static final String W_NAMESPACE =
                "declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' ";

        /**
         * Searches the paragraph for the first occurrence of {@code searched} at or
         * after {@code startPos}, following the text across text elements and runs.
         *
         * <p>Side effect: proofing-error markers found among a run's children are
         * removed from the XML while scanning. Any other child that is neither text
         * nor run properties aborts a partial match.
         *
         * <p>Limitation: the matcher does not backtrack. After a mismatch only the
         * current character is reconsidered as a new start, so a search string whose
         * own prefix repeats inside it (for example "aab" in "aaab") can be missed.
         *
         * @param paragraph the paragraph whose runs are scanned
         * @param searched  the text to look for; {@code null} or empty yields {@code null}
         * @param startPos  run index, text-element index and character index to start at
         * @return the segment (begin and end run/text/char, end inclusive) of the first
         *         match, or {@code null} if there is none
         */
        static TextSegment searchText(XWPFParagraph paragraph, String searched, PositionInParagraph startPos) {
            if (searched == null || searched.isEmpty()) {
                return null; // nothing to search for; also avoids charAt(0) on an empty string
            }

            int startRun = startPos.getRun(),
                startText = startPos.getText(),
                startChar = startPos.getChar();

            // paragraph.getRuns() is used instead of the paragraph's raw run array because
            // the raw array does not contain all runs (it lacks hyperlink runs, for example).
            List<XWPFRun> runs = paragraph.getRuns();

            // Start of the current candidate match. Kept outside the run loop so it
            // survives when the candidate continues into the next run.
            int beginRunPos = 0, beginTextPos = 0, beginCharPos = 0;
            // Number of characters of 'searched' matched so far.
            int candCharPos = 0;

            for (int runPos = startRun; runPos < runs.size(); runPos++) {
                int textPos = 0, charPos;
                CTR ctRun = runs.get(runPos).getCTR();
                XmlCursor c = ctRun.newCursor();
                c.selectPath("./*");
                try {
                    while (c.toNextSelection()) {
                        XmlObject o = c.getObject();
                        if (o instanceof CTText) {
                            // startText only limits the start run; later runs are scanned completely.
                            if (runPos > startRun || textPos >= startText) {
                                String candidate = ((CTText) o).getStringValue();
                                // startChar only applies to the exact start text element.
                                charPos = (runPos == startRun && textPos == startText) ? startChar : 0;

                                for (; charPos < candidate.length(); charPos++) {
                                    char current = candidate.charAt(charPos);
                                    if (current != searched.charAt(candCharPos)) {
                                        // Mismatch: drop the partial match, but let this very
                                        // character start a new one (e.g. "##name#").
                                        candCharPos = 0;
                                    }
                                    if (current == searched.charAt(candCharPos)) {
                                        if (candCharPos == 0) {
                                            beginRunPos = runPos;
                                            beginTextPos = textPos;
                                            beginCharPos = charPos;
                                        }
                                        if (candCharPos + 1 < searched.length()) {
                                            candCharPos++;
                                        } else {
                                            TextSegment segment = new TextSegment();
                                            segment.setBeginRun(beginRunPos);
                                            segment.setBeginText(beginTextPos);
                                            segment.setBeginChar(beginCharPos);
                                            segment.setEndRun(runPos);
                                            segment.setEndText(textPos);
                                            segment.setEndChar(charPos);
                                            return segment;
                                        }
                                    }
                                }
                            }
                            textPos++;
                        } else if (o instanceof CTProofErr) {
                            c.removeXml();
                        } else if (o instanceof CTRPr) {
                            // run properties carry no text and do not interrupt a match
                        } else {
                            // any other element between text elements interrupts a match
                            candCharPos = 0;
                        }
                    }
                } finally {
                    c.dispose();
                }
            }
            return null;
        }

        /**
         * Replaces every occurrence of {@code textToFind} in the paragraph with
         * {@code replacement}, even when an occurrence is spread over several runs.
         *
         * <p>The replacement is written into the run where the match begins, the text
         * following the match stays in the run where the match ends, and all runs
         * strictly between those two are removed. The search resumes behind the
         * inserted replacement, so a replacement that itself contains
         * {@code textToFind} is not replaced again.
         *
         * <p>NOTE(review): text elements of the begin/end run that lie inside the
         * match but are neither the begin nor the end text element are left
         * untouched; this presumably only matters for runs with three or more
         * adjacent text elements.
         *
         * @param paragraph   the paragraph to modify
         * @param textToFind  the text to replace; {@code null} or empty is a no-op
         * @param replacement the text to insert; {@code null} is treated as empty
         */
        static void replaceTextSegment(XWPFParagraph paragraph, String textToFind, String replacement) {
            if (textToFind == null || textToFind.isEmpty()) {
                return;
            }
            String newText = (replacement == null) ? "" : replacement;

            PositionInParagraph startPos = new PositionInParagraph(0, 0, 0);
            TextSegment found;
            while ((found = searchText(paragraph, textToFind, startPos)) != null) {
                int beginRunIdx = found.getBeginRun();
                int beginTextIdx = found.getBeginText();
                int endRunIdx = found.getEndRun();
                int endTextIdx = found.getEndText();

                // text in front of the match inside the first text element
                XWPFRun beginRun = paragraph.getRuns().get(beginRunIdx);
                String textBefore = beginRun.getText(beginTextIdx).substring(0, found.getBeginChar());

                // text behind the match inside the last text element (end char is inclusive)
                XWPFRun endRun = paragraph.getRuns().get(endRunIdx);
                String textAfter = endRun.getText(endTextIdx).substring(found.getEndChar() + 1);

                if (beginRunIdx == endRunIdx && beginTextIdx == endTextIdx) {
                    // the match lies within one text element: before + replacement + after
                    beginRun.setText(textBefore + newText + textAfter, beginTextIdx);
                } else {
                    // the match spans text elements: keep the tail where the match ended
                    // and put the replacement where it began
                    endRun.setText(textAfter, endTextIdx);
                    beginRun.setText(textBefore + newText, beginTextIdx);
                }

                // runs strictly between begin and end only held matched text; remove them
                // back to front so the remaining indices stay valid
                for (int runBetween = endRunIdx - 1; runBetween > beginRunIdx; runBetween--) {
                    paragraph.removeRun(runBetween);
                }

                // resume directly behind the inserted replacement; restarting at 0 would
                // loop forever when the replacement contains textToFind
                startPos = new PositionInParagraph(
                        beginRunIdx, beginTextIdx, found.getBeginChar() + newText.length());
            }
        }

        /**
         * Collects the XML objects of all {@code w:p} elements below the document
         * root that have at least one direct run ({@code w:r}) with a text element
         * ({@code w:t}).
         *
         * @param doc the document to scan
         * @return the matching paragraph objects in selection order
         */
        static List<XmlObject> getCTPObjects(XWPFDocument doc) {
            List<XmlObject> result = new ArrayList<>();
            // cursor selecting all paragraph elements
            XmlCursor cursor = doc.getDocument().newCursor();
            try {
                cursor.selectPath(W_NAMESPACE + ".//*/w:p");
                while (cursor.toNextSelection()) {
                    XmlObject obj = cursor.getObject();
                    // add only if the paragraph contains at least one run containing text
                    if (obj.selectPath(W_NAMESPACE + "./w:r/w:t").length > 0) {
                        result.add(obj);
                    }
                }
            } finally {
                cursor.dispose(); // release the cursor, as searchText does
            }
            return result;
        }

        /**
         * Applies all replacements to every paragraph returned by
         * {@link #getCTPObjects(XWPFDocument)}.
         *
         * <p>Objects that are not handed back as {@code CTP} instances are re-parsed
         * from their XML text into a {@code CTP}; after the replacement the
         * (possibly modified) paragraph XML is written back into the original object.
         *
         * @param doc          the document to modify
         * @param replacements map from text to find to its replacement; entries with a
         *                     {@code null} or empty key are skipped
         * @throws Exception if a paragraph's XML cannot be parsed
         */
        static void traverseAllParagraphsAndReplace(XWPFDocument doc, Map<String, String> replacements) throws Exception {
            for (XmlObject obj : getCTPObjects(doc)) {
                CTP ctp = (obj instanceof CTP) ? (CTP) obj : CTP.Factory.parse(obj.xmlText());
                XWPFParagraph paragraph = new XWPFParagraph(ctp, doc);

                for (Map.Entry<String, String> entry : replacements.entrySet()) {
                    String textToFind = entry.getKey();
                    if (textToFind == null || textToFind.isEmpty()) {
                        continue;
                    }
                    // cheap pre-check before the run-by-run search
                    if (paragraph.getText().contains(textToFind)) {
                        replaceTextSegment(paragraph, textToFind, entry.getValue());
                    }
                }

                // write the result back; needed when the paragraph was built from a parsed copy
                obj.set(paragraph.getCTP());
            }
        }

        /**
         * Reads {@code source.docx}, replaces the placeholders and writes the result
         * to {@code result.docx}.
         *
         * @param args unused
         * @throws Exception if reading, replacing or writing fails
         */
        public static void main(String[] args) throws Exception {
            // try-with-resources closes the input stream and the document even on failure
            try (FileInputStream in = new FileInputStream("source.docx");
                 XWPFDocument doc = new XWPFDocument(in)) {

                // NOTE(review): keys must match the document text exactly; "#surename#"
                // is spelled this way here - confirm against the document.
                Map<String, String> replacements = new HashMap<>();
                replacements.put("#name#", "Axel");
                replacements.put("#surename#", "Richter");

                traverseAllParagraphsAndReplace(doc, replacements);

                try (FileOutputStream out = new FileOutputStream("result.docx")) {
                    doc.write(out);
                }
            }
        }
    }