java spring ms-word apache-poi

How can I extract a section from Word document with Apache POI?


I'm using Apache POI (XWPF) with Spring Boot on Java 11.

I need to extract section number 2 (title and content) from a Word document that contains the following numbered list:

Word example

I would like to know how to get only section 2 and its content to create a new Word document with only that part.

Currently it loops over all paragraphs and prints their text. Is there any way to access section number two through its number in the numbered list?

 /**
  * Prints the text of every paragraph of the document, one line per paragraph.
  *
  * The text of a paragraph is built by concatenating the string form of each of its runs.
  */
 private void extractAllParagraphs(){

    // Get document
    // NOTE(review): the document is never closed here, and opening it presumably throws
    // checked exceptions that this method neither catches nor declares - confirm and handle
    // them when integrating this snippet.
    XWPFDocument doc = new XWPFDocument(OPCPackage.open("path..."));

    // Loop over all paragraphs
    for (XWPFParagraph p : doc.getParagraphs()) {
        // Start with an empty buffer for every paragraph; a buffer shared across paragraphs
        // would make each printed line repeat the text of all previous paragraphs.
        StringBuilder textPart = new StringBuilder();

        // Loop over the runs of the paragraph and collect their text
        for (XWPFRun run : p.getRuns()) {
            textPart.append(run.toString());
        }
        System.out.println(textPart);
    }
}

Solution

  • The XWPF part of Apache POI does not support numbering in Word very well, so handling numbering is not really straightforward.

    In Word, numbered paragraphs have a num-id and a numbering level set in the document. This num-id refers to a numbering definition in a separate numbering document part. There the numbering type (decimal, letter, roman, ...) and the numbering format are defined. The actual number of a paragraph is determined by this num-id, the numbering level, the numbering type, the numbering format and the count of paragraphs having the same num-id earlier in the document. So it is really complex to manage numbering while reading a Word document.

    The following working draft shows one example of how to manage numbering while reading a Word document using Apache POI. It is a working draft that shows the principle using as little code as possible. It uses in-memory structures for storing the numbering level counters and the previous numbering level in the document. The code is additionally commented to show what it does.

    import java.io.FileInputStream;
    import org.apache.poi.xwpf.usermodel.*;
    
    import java.util.List;
    import java.util.Map;
    import java.util.HashMap;
    import java.util.Iterator;
    import java.math.BigInteger;
    
    /**
     * Working draft that reads a Word document with Apache POI XWPF and prints its body as
     * simple HTML-like text ({@code <p>}, {@code <table>}, {@code <tr>}, {@code <td>}).
     *
     * For numbered paragraphs the list number is reconstructed while reading: per numbering ID
     * a counter is kept for every numbering level, and the placeholders of the level text
     * (%1, %2, ...) are replaced by the formatted counters.
     *
     * Known limitations of this draft: the number format of the current paragraph's level is
     * applied to all placeholders, and start values / start overrides of the numbering are not
     * taken into account (counting always starts at 1).
     */
    public class WordReader {

     //arabic values and their roman symbols, highest first, used for roman number formats
     private static final int[] ROMAN_VALUES = {1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1};
     private static final String[] ROMAN_UPPER = {"M", "CM", "D", "CD", "C", "XC", "L", "XL", "X", "IX", "V", "IV", "I"};
     private static final String[] ROMAN_LOWER = {"m", "cm", "d", "cd", "c", "xc", "l", "xl", "x", "ix", "v", "iv", "i"};

     //memory structure for storing the numbering level counter: numbering ID -> (level -> count)
     private final Map<Integer, Map<Integer, Integer>> numIDLvlCnt = new HashMap<Integer, Map<Integer, Integer>>();
     //memory structure for storing the previous numbering level: numbering ID -> level
     private final Map<Integer, Integer> numIDPrevNumIlv = new HashMap<Integer, Integer>();

     //collects the output of one read() call
     private final StringBuilder content = new StringBuilder();

     /**
      * Appends all given body elements (paragraphs and tables) to the content buffer.
      *
      * @param bodyElements the body elements to traverse, in document order
      * @param crlf whether to append a line break after each paragraph
      */
     private void traverseBodyElements(List<IBodyElement> bodyElements, boolean crlf) throws Exception {
      for (IBodyElement bodyElement : bodyElements) {
       if (bodyElement instanceof XWPFParagraph) {
        XWPFParagraph paragraph = (XWPFParagraph)bodyElement;
        String no = "";
        if (paragraph.getNumID() != null) { //if paragraph has numbering
         no = getCurrentNumber(paragraph);
        }
        //print paragraph, if numbered then with leading number
        content.append("<p>");
        if (no.length() > 0) {
         content.append(no).append(' ');
        }
        content.append(paragraph.getText());
        content.append("</p>");
        if (crlf) {
         content.append("\r\n");
        }
       } else if (bodyElement instanceof XWPFTable) {
        XWPFTable table = (XWPFTable)bodyElement;
        content.append("<table>");
        content.append("\r\n");
        traverseTableRows(table.getRows());
        content.append("</table>");
        content.append("\r\n");
       } // ToDo: else ...
      }
     }

     /**
      * Appends all given table rows to the content buffer, one {@code <tr>} line per row.
      */
     private void traverseTableRows(List<XWPFTableRow> tableRows) throws Exception {
      for (XWPFTableRow tableRow : tableRows) {
       content.append("<tr>");
       traverseTableCells(tableRow.getTableICells());
       content.append("</tr>");
       content.append("\r\n");
      }
     }

     /**
      * Appends all given table cells to the content buffer. The body elements of a cell are
      * traversed without line breaks after paragraphs, so a row stays on one line.
      */
     private void traverseTableCells(List<ICell> tableICells) throws Exception {
      for (ICell tableICell : tableICells) {
       if (tableICell instanceof XWPFTableCell) {
        XWPFTableCell tableCell = (XWPFTableCell)tableICell;
        content.append("<td>");
        traverseBodyElements(tableCell.getBodyElements(), false);
        content.append("</td>");
       } // ToDo: else ...
      }
     }

     /**
      * Counts up the numbering level counter for the given numbering ID and numbering level.
      *
      * @param numID the numbering ID; nothing happens if it is null
      * @param numIlvl the numbering level; if it is null only the bookkeeping entries for the
      *        numbering ID are created, nothing is counted
      */
     private void setNumIDLvlCnt(Integer numID, Integer numIlvl) {
      if (numID == null) {
       return;
      }
      //get level counter for numbering ID, create a new one if there is none
      Map<Integer, Integer> lvlCnt = numIDLvlCnt.computeIfAbsent(numID, id -> new HashMap<Integer, Integer>());
      //get previous level for numbering ID, assume level 0 if there is none
      Integer prevNumIlv = numIDPrevNumIlv.computeIfAbsent(numID, id -> 0);
      if (numIlvl == null) {
       return;
      }
      //if this level is lower than the previous one, then all deeper level counters need starting new
      if (numIlvl < prevNumIlv) {
       lvlCnt.keySet().removeIf(ilvl -> ilvl > numIlvl);
      }
      //count up 1, starting with 1 if there is no counter for this level yet
      lvlCnt.merge(numIlvl, 1, Integer::sum);
      //set this level to be the previous level
      numIDPrevNumIlv.put(numID, numIlvl);
     }

     /**
      * Formats a level counter according to the number format.
      *
      * Supported formats (case-insensitive): decimal, lowerLetter, upperLetter, lowerRoman,
      * upperRoman. Any other format yields an empty string.
      *
      * @param numFmt the number format name, may be null
      * @param cnt the counter value, may be null
      * @return the formatted number, or an empty string if it cannot be formatted
      */
     private String getNoFromCount(String numFmt, Integer cnt) {
      if (cnt == null) {
       return "";
      }
      if ("DECIMAL".equalsIgnoreCase(numFmt)) {
       return String.valueOf(cnt);
      } else if ("LOWERLETTER".equalsIgnoreCase(numFmt)) {
       return toLetters(cnt, 'a');
      } else if ("UPPERLETTER".equalsIgnoreCase(numFmt)) {
       return toLetters(cnt, 'A');
      } else if ("LOWERROMAN".equalsIgnoreCase(numFmt)) {
       return toRoman(cnt, ROMAN_LOWER);
      } else if ("UPPERROMAN".equalsIgnoreCase(numFmt)) {
       return toRoman(cnt, ROMAN_UPPER);
      } //ToDo: else ...
      return "";
     }

     /**
      * Formats a counter as letter numbering: a, b, ..., z, aa, bb, ..., zz, aaa, ...
      * (one letter, repeated once more after every 26 items).
      *
      * @param cnt the counter value, 1-based
      * @param first the first letter of the alphabet to use ('a' or 'A')
      * @return the letter numbering, or an empty string if cnt is less than 1
      */
     private static String toLetters(int cnt, char first) {
      if (cnt < 1) {
       return "";
      }
      char letter = (char)(first + (cnt - 1) % 26);
      int repeats = (cnt - 1) / 26 + 1;
      StringBuilder letters = new StringBuilder(repeats);
      for (int i = 0; i < repeats; i++) {
       letters.append(letter);
      }
      return letters.toString();
     }

     /**
      * Formats a counter as roman numeral using the given symbols.
      *
      * @param cnt the counter value
      * @param symbols the roman symbols matching ROMAN_VALUES (upper or lower case)
      * @return the roman numeral, or an empty string if cnt is less than 1
      */
     private static String toRoman(int cnt, String[] symbols) {
      StringBuilder roman = new StringBuilder();
      int rest = cnt;
      for (int i = 0; i < ROMAN_VALUES.length; i++) {
       while (rest >= ROMAN_VALUES[i]) {
        roman.append(symbols[i]);
        rest -= ROMAN_VALUES[i];
       }
      }
      return roman.toString();
     }

     /**
      * Counts the given numbered paragraph and returns its current number as text.
      *
      * @param paragraph the paragraph to get the number for
      * @return the level text with the placeholders %1, %2, ... replaced by the formatted
      *         level counters, or an empty string if the paragraph has no numbering ID or
      *         no level text is available
      */
     private String getCurrentNumber(XWPFParagraph paragraph) {
      //ToDo: take paragraph.getNumStartOverride() into account

      //get numbering ID
      BigInteger numID = paragraph.getNumID();
      if (numID == null) {
       return "";
      }
      //get current numbering level, a missing level is treated as level 0 instead of failing
      BigInteger numIlvl = paragraph.getNumIlvl();
      int ilvlValue = (numIlvl != null) ? numIlvl.intValue() : 0;

      //get numbering format
      String numFmt = paragraph.getNumFmt(); //decimal, lowerletter, roman, ..

      //set numbering level counter for current numbering ID and numbering level
      setNumIDLvlCnt(numID.intValue(), ilvlValue);

      //get numbering level text
      String numLevelText = paragraph.getNumLevelText(); // %1.%2.%3...
      if (numLevelText == null) { //nothing to format
       return "";
      }

      //get level counter for this numbering ID
      Map<Integer, Integer> lvlCnt = numIDLvlCnt.get(numID.intValue());
      String no = numLevelText;
      for (Map.Entry<Integer, Integer> levelCount : lvlCnt.entrySet()) {
       //placeholders are 1-based, levels are 0-based
       int placeholderNo = levelCount.getKey() + 1;
       //replace the placeholders %1, %2, %3, ... with formatted number from number format and level counter
       no = no.replace("%" + placeholderNo, getNoFromCount(numFmt, levelCount.getValue()));
      }
      return no;
     }

     /**
      * Reads the Word document at the given path and prints its content to standard output.
      *
      * Content and numbering counters of a previous call are discarded first, so the reader
      * can be used for several documents.
      *
      * @param inFilePath path of the Word document to read
      * @throws Exception if the file cannot be opened or read
      */
     public void read(String inFilePath) throws Exception {
      content.setLength(0);
      numIDLvlCnt.clear();
      numIDPrevNumIlv.clear();
      //close stream and document even if traversing fails
      try (FileInputStream in = new FileInputStream(inFilePath);
           XWPFDocument document = new XWPFDocument(in)) {
       traverseBodyElements(document.getBodyElements(), true);
      }
      System.out.println(content);
     }

     public static void main(String[] args) throws Exception {
      String inFilePath = "./WordDocument.docx";
      WordReader reader = new WordReader();
      reader.read(inFilePath);
     }
    }