python ms-word python-docx

How to read and input data into a table graph in a template Word file using python-docx


I have a docx file with a table graph that is not recognized by doc.tables. Here is the file: https://github.com/python-openxml/python-docx/files/1867861/non_readable_table.docx

The same issue was encountered here, but no answer was given. Please let me know if you have a solution.

import pandas as pd
from docx import Document

# Load the document in question; `doc` is reused by the table-collection loop below.
doc = Document("non_readable_table.docx")

# Show what python-docx reports via `doc.tables` (per the question, the table
# in this file is not recognized here).
print(doc.tables)

def iter_tables(block_item_container):
    """Yield every table reachable from `block_item_container`, depth-first.

    A table is yielded first, followed by any tables nested inside its cells
    (visited row by row, cell by cell), before moving on to the next sibling
    table. The container only needs a `.tables` attribute; each table needs
    `.rows`, and each row `.cells`.
    """
    for table in block_item_container.tables:
        yield table
        # Flatten rows -> cells lazily so cells are visited in document order.
        cells = (cell for row in table.rows for cell in row.cells)
        for cell in cells:
            yield from iter_tables(cell)


# Convert every table found by `iter_tables` (nested ones included) into a
# pandas DataFrame of cell text. Requires `import pandas as pd` at file level.
dfs = []
for table in iter_tables(doc):
    # Pre-size a rows x columns grid of empty strings so that cells without
    # text (or rows with fewer cells than columns) stay ''.
    grid = [[''] * len(table.columns) for _ in table.rows]
    for i, row in enumerate(table.rows):
        for j, cell in enumerate(row.cells):
            # Collapse multi-paragraph cell text onto a single line.
            grid[i][j] = cell.text.replace('\n', '')
    dfs.append(pd.DataFrame(grid))

print(dfs)

Solution

  • The non_readable_table.docx is a little unusual in that there is no content in the default body elements. All content - both tables - is in text boxes and is therefore text box content.

    Python-docx is not aware of text boxes. Only inline shapes are supported. But python-docx retains the full XML of the source *.docx file. So one can get the text box content out of the XML using XML methods.

    The following example code does this and retrieves tables from the text box content if they exist.

    from docx import Document
    from docx.text.paragraph import Paragraph
    from docx.table import Table
    from docx.text.hyperlink import Hyperlink
    from docx.text.run import Run
    
    def get_tables_in_text_box_elements(run_element: Run) -> [Table]:
        """Return the tables contained in text boxes of `run_element`.

        The run's XML is searched for `w:txbxContent` elements beneath a
        `w:drawing`; every `w:tbl` descendant of those is wrapped in a
        `Table` proxy created with no parent.
        """
        # The nested lookup is given the `w` prefix mapping explicitly.
        w_namespace = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
        text_boxes = run_element.element.xpath('./*/*/w:drawing/*/a:graphic/*/*/*/w:txbxContent')
        return [
            Table(tbl, None)
            for text_box in text_boxes
            for tbl in text_box.xpath('.//w:tbl', namespaces=w_namespace)
        ]
            
    # Walk the document body and report each element; for runs, also dump the
    # text of any tables found inside their text boxes.
    document = Document('non_readable_table.docx')

    body = document._body

    for block in body.iter_inner_content():
        # Anything that is not a paragraph is reported and skipped.
        if not isinstance(block, Paragraph):
            if isinstance(block, Table):
                print(f'Table-element: {block}')
            else:
                print('unknown body element')
            continue

        print(f'Paragraph-element: {block}')
        for inline_item in block.iter_inner_content():
            # Only runs are searched for text boxes.
            if not isinstance(inline_item, Run):
                if isinstance(inline_item, Hyperlink):
                    print(f'Hyperlink-element: {inline_item}, Address: {inline_item.address}')
                else:
                    print('unknown run element')
                continue

            print(f'Run-element: {inline_item}, Text: {inline_item.text}')

            for text_box_table in get_tables_in_text_box_elements(inline_item):
                print(f'Table-element: {text_box_table}')
                for row in text_box_table.rows:
                    # One tab-separated line per row, one entry per cell paragraph.
                    cell_texts = [
                        para.text
                        for cell in row.cells
                        for para in cell.paragraphs
                    ]
                    print('\t'.join(cell_texts))