Tags: python, parsing, ascii, python-re, prettytable

Parse ascii table header


So I need to parse this into dataframe or list:

tmp =
['+--------------+-----------------------------------------+',
 '| Something to |        Some header with subheader       |',
 '|  watch or    +-----------------+-----------------------+',
 '|     idk      |      First      |   another text again  |',
 '|              |                 |  with one more line   |',
 '|              |                 +-----------------------+',
 '|              |                 |  and this | how it be |',
 '+--------------+-----------------+-----------------------+']

It is just a plain-text table with an unusual (nested) header. I need to transform it into this:

['Something to watch or idk', 'Some header with subheader First', 'Some header with subheader another text again with one more line and this', 'Some header with subheader another text again with one more line how it be']

Here's my first solution, which gets me closer to victory (you can see my attempts in the comments):

# Indices of the lines in tmp whose first character is '+'.
pluses = [i for i, element in enumerate(tmp) if element[0] == '+']
# Lines from the first such index through the second one, inclusive.
tmp2 = tmp[pluses[0]:pluses[1]+1].copy()
# The same range of lines concatenated into one string.
table_str=''.join(tmp[pluses[0]:pluses[1]+1])
# For every line in tmp2, the positions of its '+' and '|' characters.
col=[[i for i, symbol in enumerate(line) if symbol == '+' or symbol == '|'] for line in tmp2]

# Receives a list for every inner-loop iteration below (see the append).
tmp3=[]
strt = ''.join(tmp2.copy())
# Split the joined text wherever a '+' is followed by one or more '+'/'-'
# characters; strip each piece, remove newlines, and drop empty pieces.
table_list = [l.strip().replace('\n', '') for l in re.split(r'\+[+-]+', strt) if l.strip()]
for row in table_list:
    # row is a string, so this builds one '' per character of that string.
    joined_row = ['' for _ in range(len(row))]
    # Each piece between '||' separators is handled in turn.
    for lines in [line for line in row.split('||')]:
        # Cells of this piece: split on '|', skip empty strings, strip whitespace.
        line_part = [i.strip() for i in lines.split('|') if i]
        # zip stops at the shorter list; cell texts are concatenated with no
        # separator between them.
        joined_row = [i + j for i, j in zip(joined_row, line_part)]
        # Appended on every inner iteration, so intermediate merge states end
        # up in tmp3 as well as the final one.
        tmp3.append(joined_row)

Here's the output:

tmp3
out[4]:
[['Something to', 'Some header with subheader'],
 ['Something towatch or'],
 ['idk', 'First', 'another text again'],
 ['idk', 'First', 'another text againwith one more line'],
 ['idk'],
 ['', '', 'and this', 'how it be']]

All that remains is to join these in the right way, but I don't know how to...

Here's an addendum: we can locate the pluses and separators like this:

col=[[i for i, symbol in enumerate(line) if symbol == '+' or symbol == '|'] for line in tmp2]
[[0, 15, 57],
 [0, 15, 57],
 [0, 15, 33, 57],
 [0, 15, 33, 57],
 [0, 15, 33, 57],
 [0, 15, 33, 57],
 [0, 15, 33, 45, 57],
 [0, 15, 33, 57]]

And then we could split or group by cell, but I don't know how to do that either... Please help.

Example No.2:

+----------+------------------------------------------------------------+---------------+----------------------------------+--------------------+-----------------------+
|   Number |       longtextveryveryloooooong                            |  aaaaaaaaaaa  |         bbbbbbbbbbbbbbbbbb       |    dfsdfgsdfddd    |qqqqqqqqqqqqqqqqqqqqqq |
| string   |                                                            |               |        ccccccccccccccccccccc     |    affasdd  as     |qqqqqqqqqqqqqqqqqqqqqq |
|          |                                                            |               | eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee,|    seeerrrr   e,   |   dfsdfffffffffffff   |
|          |                                                            |               | anothertext and something        |       percent      |   ttttttttttttttttt   |
|          |                                                            |               |           (nothingtodo),         |                    | sssssssssssssssssssss |
|          |                                                            |               |             and text             |                    |zzzzzzzzzzzzzzzzzzzzzz |
|          |                                                            |               +----------------------------------+                    | b rererereerr ppppppp |
|          |                                                            |               |     all         | longtext wit-  |                    |                       |
|          |                                                            |               |                 |h many character|                    |                       |
+----------+------------------------------------------------------------+---------------+-----------------+----------------+--------------------+-----------------------+

Solution

  • You could do it recursively - parsing each "sub table" at a time:

    def parse_table(table, header='', root='', table_len=None):
        """Flatten the (possibly nested) header of an ASCII-art table.

        The table is handled one "sub table" at a time: the left-most column
        of ``table`` is read first, then the function recurses into the sub
        table below it (if there is one) and into whatever lies to its right.

        Args:
            table: The table as a list of text lines. The first line is
                expected to be a ``+---+`` style border; the position of its
                second ``+`` marks the end of the current column.
            header: Text prepended to every cell read in this call (the text
                of the enclosing header cells collected so far).
            root: Header text of the first sub table in the current column.
                It replaces ``header`` when the table to the right has the
                same number of lines as the current one.
            table_len: Number of lines of the outermost table. Normally left
                unset by callers; it is filled in on the first call and
                passed down through the recursion.

        Returns:
            A list of strings, one per column found. Each string is built
            from ``header`` followed by the stripped text of every line of
            the cell, joined with spaces. An empty ``table`` yields ``[]``.

        Raises:
            IndexError: If no closing border line is found for the current
                column (``rows`` is empty), or a line is too short to be
                indexed at the column position.
        """
        # Robustness: an empty table has no cells, so return early instead
        # of failing on table[0] below.
        if not table:
            return []

        # store length of original (outermost) table
        if not table_len:
            table_len = len(table)

        # end of current "column": second '+' on the top border line
        col = table[0].find('+', 1)
        # border lines that start at the left edge and have a '+' at `col`
        rows = [
            row for row in range(1, len(table))
            if table[row].startswith('+') and table[row][col] == '+'
        ]
        # the first such border closes the current sub table
        row = rows[0]

        # split "line" contents into columns
        # end of "line" is either `+` or final `|`
        end = col
        num_cols = table[0].count('+')
        if num_cols != table[1].count('|'):
            # more/fewer separators than border corners: the cell line is
            # split further, so read up to its last '|'
            end = table[1].rfind('|')
        columns = (line[1:end].split('|') for line in table[1:row])

        # rebuild each column appending to header
        # zip(*columns) groups the pieces of every line by column position
        content = [
            ' '.join([header] + [line.strip() for line in lines]).strip()
            for lines in zip(*columns)
        ]

        # is there a table below?
        if row + 2 < len(table):
            header = content[-1]
            # if we are not the last table - we are a header
            # (the popped text is not reported on its own, only as a prefix)
            if len(rows) > 1:
                header = content.pop()
            # if we are the first table in column - we are the root
            if not root:
                root = header
            # keep only the current column's width, starting at the border
            next_table = [line[:col + 1] for line in table[row:]]
            content.extend(
                parse_table(
                    next_table,
                    header=header,
                    root=root,
                    table_len=table_len
                )
            )

        # is there a table to the right?
        if col + 2 < len(table[0]):
            # find start line of next table; enumerating from -1 yields the
            # index of the line just above the first line with '|' at `col`
            row = next(
                idx for idx, line in enumerate(table, start=-1)
                if line[col] == '|'
            )
            next_table = [line[col:] for line in table[row:]]
            # new top-level table - reset root
            if len(next_table) == table_len:
                root = ''
            # next table on same level - reset header
            if len(table) == len(next_table):
                header = root
            content.extend(
                parse_table(
                    next_table,
                    header=header,
                    root=root,
                    table_len=table_len
                )
            )

        return content
    

    Output:

    >>> parse_table(table)
    ['Something to watch or idk',
     'Some header with subheader First',
     'Some header with subheader another text again with one more line and this',
     'Some header with subheader another text again with one more line how it be']
    >>> parse_table(big_table)
    ['Number string',
     'longtextveryveryloooooong',
     'aaaaaaaaaaa',
     'bbbbbbbbbbbbbbbbbb ccccccccccccccccccccc eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee, anothertext and something (nothingtodo), and text all',
     'bbbbbbbbbbbbbbbbbb ccccccccccccccccccccc eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee, anothertext and something (nothingtodo), and text longtext wit- h many character',
     'dfsdfgsdfddd affasdd  as seeerrrr   e, percent',
     'qqqqqqqqqqqqqqqqqqqqqq qqqqqqqqqqqqqqqqqqqqqq dfsdfffffffffffff ttttttttttttttttt sssssssssssssssssssss zzzzzzzzzzzzzzzzzzzzzz b rererereerr ppppppp']
    >>> parse_table(planets)
    ['Planets Planet Sun (Solar) Earth Moon Mars',
     'Planets R (km) 696000 6371 1737 3390',
     'Planets mass (x 10^29 kg) 1989100000 5973.6 73.5 641.85']