With python-docx-oss I use the following code. I want to write the Heading 3 paragraphs to a TXT file, including only those whose numbering (outline) level is 3, i.e. x.x.x:
from docx import Document
from docx.oxml.ns import qn
def _is_heading_3(paragraph):
    """Return True if *paragraph* is a 'Heading 3' numbered at list level 3.

    Only numbering set directly on the paragraph (``w:pPr/w:numPr``) is
    inspected; a paragraph without its own ``w:numPr`` yields False.
    NOTE(review): numbering inherited from the style definition is not
    looked up here - confirm whether that case matters for your documents.
    """
    if paragraph.style.name != 'Heading 3':
        return False
    # w:pStyle and w:numPr are siblings under w:pPr, so the path to the
    # level is pPr -> numPr -> ilvl and must not go through pStyle.
    ilvl_path = '/'.join(qn(tag) for tag in ('w:pPr', 'w:numPr', 'w:ilvl'))
    ilvl = paragraph._element.find(ilvl_path)
    # Level 3 (x.x.x) is stored as the string "2" in w:ilvl/@w:val.
    return ilvl is not None and ilvl.get(qn('w:val')) == '2'


def docx_to_txt(input_path, output_path):
    """Write the level-3 'Heading 3' paragraphs of a .docx to a text file.

    Walks the direct children of the document body in order. Each paragraph
    accepted by ``_is_heading_3`` is written as its text plus a newline. Once
    the first such heading has been seen, every other body paragraph adds the
    placeholder ``'TEXT'``. Entries that are empty after stripping whitespace
    are not written.

    Args:
        input_path: Path of the .docx file to read.
        output_path: Path of the UTF-8 text file to create or overwrite.
    """
    # Local import: keeps this function self-contained with respect to the
    # file's top-level import lines.
    from docx.text.paragraph import Paragraph

    doc = Document(input_path)
    output_lines = []
    capture_heading_3_content = False
    paragraph_tag = qn('w:p')

    for element in doc.element.body:
        # Exact tag comparison: only real w:p elements are paragraphs.
        if element.tag != paragraph_tag:
            continue
        # Wrap the element directly instead of searching doc.paragraphs for
        # it, which re-scanned the whole document for every body element.
        para = Paragraph(element, doc)
        if _is_heading_3(para):
            capture_heading_3_content = True
            output_lines.append(para.text + '\n')
        elif capture_heading_3_content:
            # Placeholder kept from the original; it has no trailing newline,
            # so it runs into whatever is written next.
            output_lines.append('TEXT')

    # Write to output file, skipping entries that are blank after stripping.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(line for line in output_lines if line.strip())
# Example run: both paths are relative to the current working directory.
input_docx, output_txt = 'demo.docx', 'demo.txt'
docx_to_txt(input_path=input_docx, output_path=output_txt)
I can extract the style, which is Heading 3, using:
paragraph._element.xpath('.//w:pPr/w:pStyle/@w:val',
But the
paragraph._element.xpath('.//w:pPr/w:pStyle/w:numPr/w:ilvl/@w:val',
does not work (also when replacing ilvl with outlineLvl)
The document.xml (extracted from the docx zip) is as follows:
space="preserve"> </w:t></w:r></w:p><w:p w14:paraId="51F57A9F"
w14:textId="77777777" w:rsidR="00B23BA1" w:rsidRPr="00107F54"
w:rsidRDefault="00B23BA1" w:rsidP="00B23BA1"><w:r
w:rsidRPr="00107F54"><w:t>In the next paragraphs, ...</w:t></w:r></
w:p><w:p w14:paraId="47ED129E" w14:textId="60ED4712" w:rsidR="00E81AC6"
w:rsidRPr="00E81AC6" w:rsidRDefault="00E81AC6"
w:rsidP="00B23BA1">
<w:pPr><w:pStyle w:val="Heading3"/><w:numPr><w:ilvl w:val="2"/><w:numId w:val="5"/></w:numPr></w:pPr>
<w:bookmarkStart
w:id="140" w:name="_Toc166773442"/><w:r w:rsidRPr="00E81AC6"><w:t>Increased risk</w:t></w:r><w:bookmarkEnd
w:id="140"/></w:p><w:tbl><w:tblPr><w:tblStyle w:val="TableGrid"/
><w:tblW w:w="0" w:type="auto"/><w:tblLook w:val="04A0" w:firstRow="1"
w:lastRow="0" w:firstColumn="1" w:lastColumn="0" w:noHBand="0"
w:noVBand="1"/><w:tblCaption w:val="Finding"/
><w:tblDescription w:val="Vulnerability identifier."/></w:tblPr><w:tblGrid><w:gridCol
w:w="2547"/><w:gridCol w:w="6469"/></w:tblGrid><w:tr w:rsidR="00E81AC6"
w:rsidRPr="00E8
Why can't I get the ilvl value of 2?
`pStyle` is a sibling of `numPr`, not its parent. The XPath should be:
.//w:pPr/w:numPr/w:ilvl/@w:val
<w:pPr>
<w:pStyle w:val="Heading3"/>
<w:numPr>
<w:ilvl w:val="2"/>
<w:numId w:val="5"/>
</w:numPr>
</w:pPr>