Tags: html, beautifulsoup, docx, python-docx

How to convert html to docx for a table with nested tables?


I want to convert arbitrary HTML containing a table whose cells hold nested tables into a .docx document.

When I try to do this, additional rows appear after each row that contains nested tables. The number of extra rows equals the total number of rows in the nested tables of the preceding row. What I actually need is for each nested table to be inserted into its cell, without adding any rows to the outer table.

Here is my conversion code:

from docx import Document
from bs4 import BeautifulSoup

class HtmlToDocx:
    """Build a python-docx document from the tables found in an HTML string."""

    def __init__(self):
        # The output document that every converted table is added to.
        self.document = Document()

    def handle_table(self, table_soup, parent_docx_table=None, row_idx=0, col_idx=0):
        """Create a docx table for ``table_soup`` and fill in its cells.

        When ``parent_docx_table`` is given, the new table is added inside the
        parent's cell at (``row_idx``, ``col_idx``); otherwise it is added to
        the document itself. Cells that contain a ``<table>`` are handled by
        a recursive call instead of receiving text.
        """
        rows, cols = self.get_table_dimensions(table_soup)
        
      
        if parent_docx_table:
            cell = parent_docx_table.cell(row_idx, col_idx)
            docx_table = cell.add_table(rows=rows, cols=cols)
        else:
            docx_table = self.document.add_table(rows=rows, cols=cols)
    
        # From here on ``rows``/``cols`` are rebound from counts to element lists.
        rows = self.get_table_rows(table_soup)
        cell_row = 0
        for row in rows:
            cols = self.get_table_columns(row)
            cell_col = 0
            for col in cols:
                # find() returns the first <table> below this cell, or None.
                nested_table = col.find('table') 
                docx_cell = docx_table.cell(cell_row, cell_col)
    
                if nested_table:
                    # Render the nested table inside this cell; no text is set.
                    self.handle_table(nested_table, docx_table, cell_row, cell_col)
                    
                    cell_col += 1  
                    continue  

                cell_html = self.get_cell_html(col)
                docx_cell.text = cell_html
    
                cell_col += 1
            cell_row += 1


    def get_table_rows(self, table_soup):
        """Return the ``<tr>`` elements found under ``table_soup``."""
        # NOTE(review): find_all() is called without recursive=False and without
        # any ownership filter, so <tr> elements of tables nested inside this
        # one are presumably returned as well -- confirm against the bs4 docs.
        # That would inflate the row count computed by get_table_dimensions().
        return table_soup.find_all('tr')

    def get_table_columns(self, row):
        """Return the ``<th>``/``<td>`` elements that are direct children of ``row``."""
        return row.find_all(['th', 'td'], recursive=False)

    def get_cell_html(self, soup):
        """Return the cell's direct children as one string, skipping ``<table>`` children."""
        return ''.join([str(i) for i in soup.contents if not (i.name == 'table' or isinstance(i, BeautifulSoup))])

    def get_table_dimensions(self, table_soup):
        """Return ``(row_count, column_count)`` for ``table_soup``.

        The column count is taken from the first row only; it is 0 when
        there are no rows.
        """
        rows = self.get_table_rows(table_soup)
        cols = self.get_table_columns(rows[0]) if rows else []
        return len(rows), len(cols)


    def add_html_to_docx(self, html_content, output_file):
        """Parse ``html_content``, convert its top-level tables and save to ``output_file``."""
        self.soup = BeautifulSoup(html_content, "html.parser")
        # recursive=False: only tables that are direct children of the parsed root.
        top_tables = self.soup.find_all('table', recursive=False)
        for table_soup in top_tables:
            self.handle_table(table_soup)
        self.document.save(output_file)

html_content = """
   
    <table style="border-collapse: collapse; width: 100%; height: 232px;" border="1">
        <tbody>
        <tr style="height: 22px;">
        <td style="width: 23.7413%; height: 22px;">&nbsp;</td>
        <td style="width: 23.7413%; height: 22px;">&nbsp;</td>
        <td style="width: 23.7413%; height: 22px;">&nbsp;</td>
        <td style="width: 23.7413%; height: 22px;">&nbsp;</td>
        </tr>
        <tr style="height: 188px;">
        <td style="width: 23.7413%; height: 188px;">&nbsp;</td>
        <td style="width: 23.7413%; height: 188px;">
        </td>
        <td style="width: 23.7413%; height: 188px;">
        <table style="border-collapse: collapse; width: 95.0877%;" border="1">
        <tbody>
        <tr>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6246%;">&nbsp;</td>
        </tr>
        <tr>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">ааа</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6246%;">&nbsp;</td>
        </tr>
        <tr>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6246%;">&nbsp;</td>
        </tr>
        </tbody>
        </table>
        </td>
        <td style="width: 23.7413%; height: 188px;">
        <table style="border-collapse: collapse; width: 95.0877%;" border="1">
        <tbody>
        <tr>
        <td style="width: 27.8655%;">е</td>
        <td style="width: 27.8655%;">&nbsp;</td>
        <td style="width: 27.8742%;">&nbsp;</td>
        </tr>
        <tr>
        <td style="width: 27.8655%;">&nbsp;</td>
        <td style="width: 27.8655%;">еее</td>
        <td style="width: 27.8742%;">&nbsp;</td>
        </tr>
        </tbody>
        </table>
        </td>
        </tr>
        <tr style="height: 22px;">
        <td style="width: 23.7413%; height: 22px;">&nbsp;</td>
        <td style="width: 23.7413%; height: 22px;">&nbsp;</td>
        <td style="width: 23.7413%; height: 22px;">
        <table style="border-collapse: collapse; width: 95.0877%;" border="1">
        <tbody>
        <tr>
        <td style="width: 19.5803%;">11</td>
        <td style="width: 19.5803%;">&nbsp;</td>
        <td style="width: 19.5803%;">&nbsp;</td>
        <td style="width: 19.5889%;">&nbsp;</td>
        </tr>
        <tr>
        <td style="width: 19.5803%;">&nbsp;</td>
        <td style="width: 19.5803%;">&nbsp;</td>
        <td style="width: 19.5803%;">222</td>
        <td style="width: 19.5889%;">333</td>
        </tr>
        </tbody>
        </table>
        </td>
        <td style="width: 23.7413%; height: 22px;">&nbsp;</td>
        </tr>
        </tbody>
        </table>
"""
converter = HtmlToDocx()
converter.add_html_to_docx(html_content, "output.docx")

This is the result I get (see image): [enter image description here]


Solution

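  • The fix is to give every table (including each nested one) a unique id, so that the rows belonging to a table can be told apart from the rows of the tables nested inside it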

    from docx import Document
    from bs4 import BeautifulSoup
    import uuid
    
    class HtmlToDocx:
        """Convert HTML tables, including tables nested in cells, to a .docx file.

        Every ``<table>`` is tagged with a unique ``data-table-id`` attribute so
        that the rows owned by a table can be distinguished from the rows of
        tables nested inside its cells.
        """

        def __init__(self):
            # The output document that every converted table is added to.
            self.document = Document()

        def add_unique_ids_to_tables(self, soup):
            """Tag every ``<table>`` under ``soup`` with a unique ``data-table-id``."""
            for table in soup.find_all('table'):
                table['data-table-id'] = str(uuid.uuid4())

        def handle_table(self, table_soup, parent_docx_table=None, row_idx=0, col_idx=0):
            """Create a docx table for ``table_soup`` and fill in its cells.

            Args:
                table_soup: The ``<table>`` element to convert. It must already
                    carry a ``data-table-id`` (see ``add_unique_ids_to_tables``).
                parent_docx_table: The docx table that should contain the new
                    table, or ``None`` to add it to the document body.
                row_idx: Row of the parent cell that receives the nested table.
                col_idx: Column of the parent cell that receives the nested table.
            """
            n_rows, n_cols = self.get_table_dimensions(table_soup)
            if not n_rows or not n_cols:
                # A table without rows or cells has nothing to render.
                return

            if parent_docx_table is not None:
                cell = parent_docx_table.cell(row_idx, col_idx)
                docx_table = cell.add_table(rows=n_rows, cols=n_cols)
            else:
                docx_table = self.document.add_table(rows=n_rows, cols=n_cols)

            for cell_row, row in enumerate(self.get_table_rows(table_soup)):
                for cell_col, col in enumerate(self.get_table_columns(row)):
                    nested_table = col.find('table')
                    if nested_table is not None:
                        # Render the nested table inside this cell instead of text.
                        self.handle_table(nested_table, docx_table, cell_row, cell_col)
                    else:
                        docx_cell = docx_table.cell(cell_row, cell_col)
                        docx_cell.text = self.get_cell_html(col).strip()

        def get_table_rows(self, table_soup):
            """Return only the ``<tr>`` elements that belong to ``table_soup`` itself.

            ``find_all('tr')`` also yields the rows of nested tables, so each row
            is kept only when its closest enclosing ``<table>`` has the same
            ``data-table-id`` as ``table_soup``.
            """
            table_id = table_soup.get('data-table-id')
            return [
                tr
                for tr in table_soup.find_all('tr')
                if tr.find_parent('table')['data-table-id'] == table_id
            ]

        def get_table_columns(self, row):
            """Return the ``<th>``/``<td>`` elements that are direct children of ``row``."""
            return row.find_all(['th', 'td'], recursive=False)

        def get_cell_html(self, soup):
            """Return the cell's direct children as one string, skipping ``<table>`` children."""
            return ''.join(str(node) for node in soup.contents if node.name != 'table')

        def get_table_dimensions(self, table_soup):
            """Return ``(row_count, column_count)`` for ``table_soup``.

            The column count is the widest row, so that a row with more cells
            than the first one still fits into the docx grid. ``(0, 0)`` is
            returned for a table without rows.
            """
            rows = self.get_table_rows(table_soup)
            if not rows:
                return 0, 0

            widest = max(len(self.get_table_columns(row)) for row in rows)
            return len(rows), widest

        def add_html_to_docx(self, html_content, output_file):
            """Parse ``html_content``, convert its outermost tables and save the document.

            Args:
                html_content: HTML markup to convert.
                output_file: Path (or file-like object) passed to ``Document.save``.
            """
            self.soup = BeautifulSoup(html_content, "html.parser")
            self.add_unique_ids_to_tables(self.soup)
            # Outermost tables are those with no enclosing <table>; this also
            # finds tables wrapped in <body>, <div>, etc., not only root children.
            top_tables = [
                table
                for table in self.soup.find_all('table')
                if table.find_parent('table') is None
            ]
            for table_soup in top_tables:
                self.handle_table(table_soup)
            self.document.save(output_file)
    
    
    html_content = """
        <table style="border-collapse: collapse; width: 100%; height: 232px;" border="1">
        <tbody>
        <tr style="height: 22px;">
        <td style="width: 23.7413%; height: 22px;">&nbsp;</td>
        <td style="width: 23.7413%; height: 22px;">&nbsp;</td>
        <td style="width: 23.7413%; height: 22px;">&nbsp;</td>
        <td style="width: 23.7413%; height: 22px;">&nbsp;</td>
        </tr>
        <tr style="height: 188px;">
        <td style="width: 23.7413%; height: 188px;">&nbsp;</td>
        <td style="width: 23.7413%; height: 188px;">
        </td>
        <td style="width: 23.7413%; height: 188px;">
        <table style="border-collapse: collapse; width: 95.0877%;" border="1">
        <tbody>
        <tr>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6246%;">&nbsp;</td>
        </tr>
        <tr>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">ааа</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6246%;">&nbsp;</td>
        </tr>
        <tr>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6073%;">&nbsp;</td>
        <td style="width: 14.6246%;">&nbsp;</td>
        </tr>
        </tbody>
        </table>
        </td>
        <td style="width: 23.7413%; height: 188px;">
        <table style="border-collapse: collapse; width: 95.0877%;" border="1">
        <tbody>
        <tr>
        <td style="width: 27.8655%;">е</td>
        <td style="width: 27.8655%;">&nbsp;</td>
        <td style="width: 27.8742%;">&nbsp;</td>
        </tr>
        <tr>
        <td style="width: 27.8655%;">&nbsp;</td>
        <td style="width: 27.8655%;">еее</td>
        <td style="width: 27.8742%;">&nbsp;</td>
        </tr>
        </tbody>
        </table>
        </td>
        </tr>
        <tr style="height: 22px;">
        <td style="width: 23.7413%; height: 22px;">&nbsp;</td>
        <td style="width: 23.7413%; height: 22px;">&nbsp;</td>
        <td style="width: 23.7413%; height: 22px;">
        <table style="border-collapse: collapse; width: 95.0877%;" border="1">
        <tbody>
        <tr>
        <td style="width: 19.5803%;">11</td>
        <td style="width: 19.5803%;">&nbsp;</td>
        <td style="width: 19.5803%;">&nbsp;</td>
        <td style="width: 19.5889%;">&nbsp;</td>
        </tr>
        <tr>
        <td style="width: 19.5803%;">&nbsp;</td>
        <td style="width: 19.5803%;">&nbsp;</td>
        <td style="width: 19.5803%;">222</td>
        <td style="width: 19.5889%;">333</td>
        </tr>
        </tbody>
        </table>
        </td>
        <td style="width: 23.7413%; height: 22px;">&nbsp;</td>
        </tr>
        </tbody>
        </table>
    """
    
    converter = HtmlToDocx()
    converter.add_html_to_docx(html_content, "output.docx")