So I need to parse this into dataframe or list:
tmp =
['+--------------+-----------------------------------------+',
'| Something to | Some header with subheader |',
'| watch or +-----------------+-----------------------+',
'| idk | First | another text again |',
'| | | with one more line |',
'| | +-----------------------+',
'| | | and this | how it be |',
'+--------------+-----------------+-----------------------+']
It is just txt table with strange header. I need to transform it to this:
['Something to watch or idk', 'Some header with subheader First', 'Some header with subheader another text again with one more line and this', 'Some header with subheader another text again with one more line how it be']
Here's my first solution that make me closer to victory (you can see the comments my tries):
pluses = [i for i, element in enumerate(tmp) if element[0] == '+']
tmp2 = tmp[pluses[0]:pluses[1]+1].copy()
table_str=''.join(tmp[pluses[0]:pluses[1]+1])
col=[[i for i, symbol in enumerate(line) if symbol == '+' or symbol == '|'] for line in tmp2]
tmp3=[]
strt = ''.join(tmp2.copy())
table_list = [l.strip().replace('\n', '') for l in re.split(r'\+[+-]+', strt) if l.strip()]
for row in table_list:
joined_row = ['' for _ in range(len(row))]
for lines in [line for line in row.split('||')]:
line_part = [i.strip() for i in lines.split('|') if i]
joined_row = [i + j for i, j in zip(joined_row, line_part)]
tmp3.append(joined_row)
here's out:
tmp3
out[4]:
[['Something to', 'Some header with subheader'],
['Something towatch or'],
['idk', 'First', 'another text again'],
['idk', 'First', 'another text againwith one more line'],
['idk'],
['', '', 'and this', 'how it be']]
Remains only join this in the right way but idk how to...
Here's addon: We can locate pluses and splitters by this:
col=[[i for i, symbol in enumerate(line) if symbol == '+' or symbol == '|'] for line in tmp2]
[[0, 15, 57],
[0, 15, 57],
[0, 15, 33, 57],
[0, 15, 33, 57],
[0, 15, 33, 57],
[0, 15, 33, 57],
[0, 15, 33, 45, 57],
[0, 15, 33, 57]]
And then we can split or group by cell but idk how to too... Please help
Example No.2:
+----------+------------------------------------------------------------+---------------+----------------------------------+--------------------+-----------------------+
| Number | longtextveryveryloooooong | aaaaaaaaaaa | bbbbbbbbbbbbbbbbbb | dfsdfgsdfddd |qqqqqqqqqqqqqqqqqqqqqq |
| string | | | ccccccccccccccccccccc | affasdd as |qqqqqqqqqqqqqqqqqqqqqq |
| | | | eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee,| seeerrrr e, | dfsdfffffffffffff |
| | | | anothertext and something | percent | ttttttttttttttttt |
| | | | (nothingtodo), | | sssssssssssssssssssss |
| | | | and text | |zzzzzzzzzzzzzzzzzzzzzz |
| | | +----------------------------------+ | b rererereerr ppppppp |
| | | | all | longtext wit- | | |
| | | | |h many character| | |
+----------+------------------------------------------------------------+---------------+-----------------+----------------+--------------------+-----------------------+
You could do it recursively - parsing each "sub table" at a time:
def parse_table(table, header='', root='', table_len=None):
# store length of original table
if not table_len:
table_len = len(table)
# end of current "column"
col = table[0].find('+', 1)
rows = [
row for row in range(1, len(table))
if table[row].startswith('+')
and table[row][col] == '+'
]
row = rows[0]
# split "line" contents into columns
# end of "line" is either `+` or final `|`
end = col
num_cols = table[0].count('+')
if num_cols != table[1].count('|'):
end = table[1].rfind('|')
columns = (line[1:end].split('|') for line in table[1:row])
# rebuild each column appending to header
content = [
' '.join([header] + [line.strip() for line in lines]).strip()
for lines in zip(*columns)
]
# is there a table below?
if row + 2 < len(table):
header = content[-1]
# if we are not the last table - we are a header
if len(rows) > 1:
header = content.pop()
# if we are the first table in column - we are the root
if not root:
root = header
next_table = [line[:col + 1] for line in table[row:]]
content.extend(
parse_table(
next_table,
header=header,
root=root,
table_len=table_len
)
)
# is there a table to the right?
if col + 2 < len(table[0]):
# find start line of next table
row = next(
row for row, line in enumerate(table, start=-1)
if line[col] == '|'
)
next_table = [line[col:] for line in table[row:]]
# new top-level table - reset root
if len(next_table) == table_len:
root = ''
# next table on same level - reset header
if len(table) == len(next_table):
header = root
content.extend(
parse_table(
next_table,
header=header,
root=root,
table_len=table_len
)
)
return content
Output:
>>> parse_table(table)
['Something to watch or idk',
'Some header with subheader First',
'Some header with subheader another text again with one more line and this',
'Some header with subheader another text again with one more line how it be']
>>> parse_table(big_table)
['Number string',
'longtextveryveryloooooong',
'aaaaaaaaaaa',
'bbbbbbbbbbbbbbbbbb ccccccccccccccccccccc eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee, anothertext and something (nothingtodo), and text all',
'bbbbbbbbbbbbbbbbbb ccccccccccccccccccccc eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee, anothertext and something (nothingtodo), and text longtext wit- h many character',
'dfsdfgsdfddd affasdd as seeerrrr e, percent',
'qqqqqqqqqqqqqqqqqqqqqq qqqqqqqqqqqqqqqqqqqqqq dfsdfffffffffffff ttttttttttttttttt sssssssssssssssssssss zzzzzzzzzzzzzzzzzzzzzz b rererereerr ppppppp']
>>> parse_table(planets)
['Planets Planet Sun (Solar) Earth Moon Mars',
'Planets R (km) 696000 6371 1737 3390',
'Planets mass (x 10^29 kg) 1989100000 5973.6 73.5 641.85']