I have a docx file with a table graphic which cannot be recognized by doc.tables. Here is the file: https://github.com/python-openxml/python-docx/files/1867861/non_readable_table.docx
The same issue was encountered here, but no answer was given. Please let me know if you have a solution.
import pandas as pd
from docx import Document
# Load the sample document with python-docx.
doc = Document("non_readable_table.docx")
# Show what doc.tables reports for this file.
print(doc.tables)
def iter_tables(block_item_container):
    """Recursively generate all tables in `block_item_container`.

    Each table found directly in the container is yielded first, followed by
    the tables nested in its cells (depth-first).

    A merged cell is reported by `row.cells` once for every grid position it
    spans, so the same cell can show up several times within one table. Each
    distinct cell is therefore descended into only once; otherwise every
    table nested in a merged cell would be yielded repeatedly.

    Args:
        block_item_container: Any object exposing a `.tables` sequence, such
            as a document or a table cell.

    Yields:
        Every table reachable from the container, each exactly once.
    """
    for table in block_item_container.tables:
        yield table
        # Cells already descended into for this table. Keyed by the cell's
        # underlying `_tc` element when it has one, because separate proxy
        # objects may wrap the same merged cell; falls back to the cell
        # object itself otherwise.
        visited = set()
        for row in table.rows:
            for cell in row.cells:
                key = getattr(cell, "_tc", cell)
                if key in visited:
                    continue
                visited.add(key)
                yield from iter_tables(cell)
# Convert every table reachable from the document (including nested ones)
# into a pandas DataFrame of cell texts. `pd` is pandas, imported at the top
# of this snippet.
dfs = []
for table in iter_tables(doc):
    # Pre-size the grid to rows x columns so that any position not filled
    # below stays an empty string.
    column_count = len(table.columns)
    grid = [[''] * column_count for _ in table.rows]
    for i, row in enumerate(table.rows):
        for j, cell in enumerate(row.cells):
            # Line breaks are removed so each cell is a single-line string.
            grid[i][j] = cell.text.replace('\n', '')
    dfs.append(pd.DataFrame(grid))
print(dfs)
The file non_readable_table.docx is a little bit strange, as there is no content in the default body elements. All content - both tables - is in text boxes and is therefore text box content.
Python-docx is not aware of text boxes; only inline shapes are supported. But python-docx retains the full XML of the source *.docx file, so one can get the text box content out of the XML using XML methods.
The following example code does this and retrieves tables from the text box content if they exist.
from docx import Document
from docx.text.paragraph import Paragraph
from docx.table import Table
from docx.text.hyperlink import Hyperlink
from docx.text.run import Run
def get_tables_in_text_box_elements(run_element: Run) -> [Table]:
    """Collect the tables found in text-box content below a run.

    The run's XML element is searched for `w:txbxContent` elements along a
    fixed path through `w:drawing` and `a:graphic`; every `w:tbl` descendant
    of each match is wrapped in a `Table` created with `None` as its parent.

    Args:
        run_element: The run whose `.element` XML is searched.

    Returns:
        A list of `Table` objects, empty when the run holds no matching
        text-box content.
    """
    # The second query passes its namespace map explicitly.
    w_namespace = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
    text_box_contents = run_element.element.xpath(
        './*/*/w:drawing/*/a:graphic/*/*/*/w:txbxContent'
    )
    return [
        Table(tbl, None)
        for content in text_box_contents
        for tbl in content.xpath('.//w:tbl', namespaces=w_namespace)
    ]
# Walk the document body and report each paragraph, run, hyperlink and table
# encountered, including tables held in text boxes attached to runs.
document = Document('non_readable_table.docx')
body = document._body

for block in body.iter_inner_content():
    # Anything that is not a paragraph is reported and skipped.
    if not isinstance(block, Paragraph):
        if isinstance(block, Table):
            print(f'Table-element: {block}')
        else:
            print('unknown body element')
        continue

    print(f'Paragraph-element: {block}')
    for item in block.iter_inner_content():
        # Only runs are searched for text-box tables.
        if not isinstance(item, Run):
            if isinstance(item, Hyperlink):
                print(f'Hyperlink-element: {item}, Address: {item.address}')
            else:
                print('unknown run element')
            continue

        print(f'Run-element: {item}, Text: {item.text}')
        for text_box_table in get_tables_in_text_box_elements(item):
            print(f'Table-element: {text_box_table}')
            for table_row in text_box_table.rows:
                # One tab-separated output line per row, built from the text
                # of every paragraph in every cell of that row.
                cell_texts = [
                    para.text
                    for row_cell in table_row.cells
                    for para in row_cell.paragraphs
                ]
                print('\t'.join(cell_texts))