python parsing ascii python-re prettytable

Parse ascii table header

I need to parse this into a DataFrame or a list:

tmp =
['+--------------+-----------------------------------------+',
 '| Something to |        Some header with subheader       |',
 '|  watch or    +-----------------+-----------------------+',
 '|     idk      |      First      |   another text again  |',
 '|              |                 |  with one more line   |',
 '|              |                 +-----------------------+',
 '|              |                 |  and this | how it be |',
 '+--------------+-----------------+-----------------------+']

It is just a plain-text table with an unusual (nested) header. I need to transform it into this:

['Something to watch or idk', 'Some header with subheader First', 'Some header with subheader another text again with one more line and this', 'Some header with subheader another text again with one more line how it be']

Here's my first attempt, which gets me closer to a solution (you can see my other tries in the comments):

# Indices of the lines whose first character is '+', i.e. the lines that
# begin with a border corner.
pluses = [i for i, element in enumerate(tmp) if element[0] == '+']
# Lines from the first such line through the second one, inclusive.
tmp2 = tmp[pluses[0]:pluses[1]+1].copy()
# The same slice concatenated into one string (not used again in this snippet).
table_str=''.join(tmp[pluses[0]:pluses[1]+1])
# For every line, the character positions of the '+' and '|' separators.
col=[[i for i, symbol in enumerate(line) if symbol == '+' or symbol == '|'] for line in tmp2]

# Accumulates one list of cell texts per physical table line (see loop below).
tmp3=[]
# All lines glued together with no separator, so the closing '|' of one line
# is immediately followed by the opening '|' of the next ('||').
strt = ''.join(tmp2.copy())
# Split on border runs (a '+' followed by one or more '+' or '-'); each
# remaining chunk is stripped and empty chunks are dropped.
table_list = [l.strip().replace('\n', '') for l in re.split(r'\+[+-]+', strt) if l.strip()]
for row in table_list:
    # One empty string per *character* of the chunk; zip() below truncates
    # this to the number of cells actually found on the line.
    joined_row = ['' for _ in range(len(row))]
    # '||' marks the boundary between two physical lines inside the chunk.
    for lines in [line for line in row.split('||')]:
        # Cell texts of this line, with empty pieces from the split removed.
        line_part = [i.strip() for i in lines.split('|') if i]
        # Concatenate cell-by-cell with what was collected so far (no
        # separator is inserted, hence results like 'Something towatch or').
        joined_row = [i + j for i, j in zip(joined_row, line_part)]
        # Appended on every physical line, so partial accumulations are
        # stored as well as the final one.
        tmp3.append(joined_row)

Here's the output:

tmp3
out[4]:
[['Something to', 'Some header with subheader'],
 ['Something towatch or'],
 ['idk', 'First', 'another text again'],
 ['idk', 'First', 'another text againwith one more line'],
 ['idk'],
 ['', '', 'and this', 'how it be']]

All that remains is to join these rows in the right way, but I don't know how to do that...

Here's an addition: we can locate the pluses and the column separators like this:

col=[[i for i, symbol in enumerate(line) if symbol == '+' or symbol == '|'] for line in tmp2]
[[0, 15, 57],
 [0, 15, 57],
 [0, 15, 33, 57],
 [0, 15, 33, 57],
 [0, 15, 33, 57],
 [0, 15, 33, 57],
 [0, 15, 33, 45, 57],
 [0, 15, 33, 57]]

Then we could split or group by cell, but I don't know how to do that either... Please help.

Example No.2:

+----------+------------------------------------------------------------+---------------+----------------------------------+--------------------+-----------------------+
|   Number |       longtextveryveryloooooong                            |  aaaaaaaaaaa  |         bbbbbbbbbbbbbbbbbb       |    dfsdfgsdfddd    |qqqqqqqqqqqqqqqqqqqqqq |
| string   |                                                            |               |        ccccccccccccccccccccc     |    affasdd  as     |qqqqqqqqqqqqqqqqqqqqqq |
|          |                                                            |               | eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee,|    seeerrrr   e,   |   dfsdfffffffffffff   |
|          |                                                            |               | anothertext and something        |       percent      |   ttttttttttttttttt   |
|          |                                                            |               |           (nothingtodo),         |                    | sssssssssssssssssssss |
|          |                                                            |               |             and text             |                    |zzzzzzzzzzzzzzzzzzzzzz |
|          |                                                            |               +----------------------------------+                    | b rererereerr ppppppp |
|          |                                                            |               |     all         | longtext wit-  |                    |                       |
|          |                                                            |               |                 |h many character|                    |                       |
+----------+------------------------------------------------------------+---------------+-----------------+----------------+--------------------+-----------------------+

Solution

You could do it recursively - parsing each "sub table" at a time:

def parse_table(table, header='', root='', table_len=None):
    """Flatten the nested header of an ASCII-art table into column names.

    The table is processed recursively, one "sub table" at a time: the
    left-most column of ``table`` is read, then any table nested below it,
    then whatever lies to its right.

    Args:
        table: Sequence of equal-width strings drawn with ``+``, ``-`` and
            ``|``. The first line must be a border line.
        header: Text prepended to every cell read from this sub table.
            Callers normally leave it empty; the recursion fills it in.
        root: Header to fall back to when moving right to a sibling table
            of the same height. Set by the recursion.
        table_len: Number of lines of the outermost table, used to detect
            when the table to the right is a new top-level column. Set by
            the recursion.

    Returns:
        A list with one string per leaf column, left to right. Each string
        is the parent header texts followed by the column's own lines,
        joined with single spaces. An empty ``table`` yields ``[]``.

    Raises:
        IndexError, StopIteration: The input is not validated; a table
            without a closing border row or with ragged lines fails with
            one of these.
    """
    # An empty table has no columns; without this guard table[0] raises.
    if not table:
        return []

    # store length of original table
    if not table_len:
        table_len = len(table)

    # end of current "column": the first '+' after the left corner
    col = table[0].find('+', 1)
    # border rows closing this column: they start with '+' and also have
    # a '+' at the column's right edge
    rows = [
        row for row in range(1, len(table))
        if table[row].startswith('+') and table[row][col] == '+'
    ]
    row = rows[0]

    # split "line" contents into columns
    # end of "line" is either `+` or final `|`
    end = col
    num_cols = table[0].count('+')
    if num_cols != table[1].count('|'):
        end = table[1].rfind('|')
    columns = (line[1:end].split('|') for line in table[1:row])

    # rebuild each column appending to header
    # Empty parts (blank padding lines inside a cell, or an empty header)
    # are skipped so they do not inject doubled spaces into the result.
    content = [
        ' '.join(
            part
            for part in [header, *(line.strip() for line in lines)]
            if part
        ).strip()
        for lines in zip(*columns)
    ]

    # is there a table below?
    if row + 2 < len(table):
        header = content[-1]
        # if we are not the last table - we are a header
        if len(rows) > 1:
            header = content.pop()
        # if we are the first table in column - we are the root
        if not root:
            root = header
        next_table = [line[:col + 1] for line in table[row:]]
        content.extend(
            parse_table(
                next_table,
                header=header,
                root=root,
                table_len=table_len
            )
        )

    # is there a table to the right?
    if col + 2 < len(table[0]):
        # find start line of next table; start=-1 makes `row` the index of
        # the border line just above the first line with '|' at `col`
        row = next(
            row for row, line in enumerate(table, start=-1)
                if line[col] == '|'
        )
        next_table = [line[col:] for line in table[row:]]
        # new top-level table - reset root
        if len(next_table) == table_len:
            root = ''
        # next table on same level - reset header
        if len(table) == len(next_table):
            header = root
        content.extend(
            parse_table(
                next_table,
                header=header,
                root=root,
                table_len=table_len
            )
        )

    return content

Output:

>>> parse_table(table)
['Something to watch or idk',
 'Some header with subheader First',
 'Some header with subheader another text again with one more line and this',
 'Some header with subheader another text again with one more line how it be']
>>> parse_table(big_table)
['Number string',
 'longtextveryveryloooooong',
 'aaaaaaaaaaa',
 'bbbbbbbbbbbbbbbbbb ccccccccccccccccccccc eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee, anothertext and something (nothingtodo), and text all',
 'bbbbbbbbbbbbbbbbbb ccccccccccccccccccccc eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee, anothertext and something (nothingtodo), and text longtext wit- h many character',
 'dfsdfgsdfddd affasdd  as seeerrrr   e, percent',
 'qqqqqqqqqqqqqqqqqqqqqq qqqqqqqqqqqqqqqqqqqqqq dfsdfffffffffffff ttttttttttttttttt sssssssssssssssssssss zzzzzzzzzzzzzzzzzzzzzz b rererereerr ppppppp']
>>> parse_table(planets)
['Planets Planet Sun (Solar) Earth Moon Mars',
 'Planets R (km) 696000 6371 1737 3390',
 'Planets mass (x 10^29 kg) 1989100000 5973.6 73.5 641.85']