python xpath lxml python-docx

Cannot extract element xpath from docx


With python-docx-oss, I want to write the Heading 3 paragraphs to a TXT file, including only those whose numbering (outline) level is 3, i.e. x.x.x. I use the following code:

from docx import Document
from docx.oxml.ns import qn

def docx_to_txt(input_path, output_path):
    """Write the text of level-3 'Heading 3' paragraphs of a .docx to a text file.

    The children of the document body are visited in order. Each child whose
    tag ends with 'p' is matched to its paragraph object. A paragraph accepted
    by ``is_heading_3`` contributes its text followed by a newline; once such
    a heading has been seen, every other paragraph contributes the literal
    placeholder 'TEXT' (without a newline). Entries that are blank after
    stripping are not written.

    Args:
        input_path: Path of the .docx file to read.
        output_path: Path of the UTF-8 text file to (over)write.
    """
    word_ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

    def is_heading_3(paragraph):
        """Return a truthy value when the paragraph is a level-3 'Heading 3'."""
        if paragraph.style.name != 'Heading 3':
            return False
        # Looks up the w:ilvl value along the path w:pPr/w:pStyle/w:numPr.
        levels = paragraph._element.xpath(
            './/w:pPr/w:pStyle/w:numPr/w:ilvl/@w:val',
            namespaces=word_ns,
        )
        print(levels)
        if not levels:
            # Nothing matched: hand back the (falsy) query result itself.
            return levels
        # Outline level 3 is represented by '2' in Word.
        return levels[0] == '2'

    doc = Document(input_path)
    collected = []
    seen_heading = False

    for child in doc.element.body:
        if not child.tag.endswith('p'):
            continue  # only paragraph-like children are considered
        # Find the paragraph object that wraps this body child.
        current = next(p for p in doc.paragraphs if p._element == child)
        if is_heading_3(current):
            seen_heading = True
            collected.append(current.text + '\n')
        elif seen_heading:
            collected.append('TEXT')

    # Write the collected entries, leaving out whitespace-only ones.
    with open(output_path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(entry for entry in collected if entry.strip())

# Run the conversion with the sample input and output file names.
input_docx, output_txt = 'demo.docx', 'demo.txt'
docx_to_txt(input_docx, output_txt)

I can extract the style, which is Heading 3, using:

paragraph._element.xpath('.//w:pPr/w:pStyle/@w:val', 

But the

paragraph._element.xpath('.//w:pPr/w:pStyle/w:numPr/w:ilvl/@w:val', 

does not work, even when I replace ilvl with outlineLvl.

The document.xml (extracted from the docx zip) is as follows:

space="preserve"> </w:t></w:r></w:p><w:p w14:paraId="51F57A9F" 
w14:textId="77777777" w:rsidR="00B23BA1" w:rsidRPr="00107F54" 
w:rsidRDefault="00B23BA1" w:rsidP="00B23BA1"><w:r 
w:rsidRPr="00107F54"><w:t>In the next paragraphs, ...</w:t></w:r></
w:p><w:p w14:paraId="47ED129E" w14:textId="60ED4712" w:rsidR="00E81AC6"
 w:rsidRPr="00E81AC6" w:rsidRDefault="00E81AC6" 
w:rsidP="00B23BA1">
<w:pPr><w:pStyle w:val="Heading3"/><w:numPr><w:ilvl w:val="2"/><w:numId w:val="5"/></w:numPr></w:pPr>
<w:bookmarkStart 
w:id="140" w:name="_Toc166773442"/><w:r w:rsidRPr="00E81AC6"><w:t>Increased risk</w:t></w:r><w:bookmarkEnd 
w:id="140"/></w:p><w:tbl><w:tblPr><w:tblStyle w:val="TableGrid"/
><w:tblW w:w="0" w:type="auto"/><w:tblLook w:val="04A0" w:firstRow="1"
 w:lastRow="0" w:firstColumn="1" w:lastColumn="0" w:noHBand="0" 
w:noVBand="1"/><w:tblCaption w:val="Finding"/
><w:tblDescription w:val="Vulnerability identifier."/></w:tblPr><w:tblGrid><w:gridCol 
w:w="2547"/><w:gridCol w:w="6469"/></w:tblGrid><w:tr w:rsidR="00E81AC6"
 w:rsidRPr="00E8

Why can't I get the ilvl value of 2?


Solution

  • pStyle is a sibling of numPr, not its parent: both are direct children of pPr. The xpath should therefore be:

    .//w:pPr/w:numPr/w:ilvl/@w:val

    <w:pPr>
      <w:pStyle w:val="Heading3"/>
      <w:numPr>
        <w:ilvl w:val="2"/>
        <w:numId w:val="5"/>
      </w:numPr>
    </w:pPr>