I want to pretty-print HTML while keeping each <tr> element and its children on a single line.
The HTML string looks like this: html = '''<html><body><h1>hello world</h1><table><tr><td>1 STRING</td><td>2 STRING</td><td>3 STRING</td></tr></table></body></html>'''
I tried to solve the problem with bs4's prettify(), but it isn't giving the correct result.
from bs4 import BeautifulSoup
# Original HTML string
html = '''<html><body><h1>hello world</h1><table><tr><td>1 STRING</td><td>2 STRING</td><td>3 STRING</td></tr></table></body></html>'''

# Parse the markup and ask bs4 for its pretty-printed serialization.
soup = BeautifulSoup(html, 'html.parser')
prettified_html = soup.prettify()

# Attempt: build a one-line version of every <tr> and substitute it into the
# prettified text.
# NOTE: str(tr) is the compact serialization of the row, which does not occur
# verbatim in the multi-line prettified text, so replace() finds nothing to
# substitute and the rows stay expanded.
for tr in soup.find_all('tr'):
    inline_tr = f"<tr>{''.join(str(td) for td in tr.find_all('td'))}</tr>"
    prettified_html = prettified_html.replace(str(tr), inline_tr)

print(prettified_html)
Output:
<html>
<body>
<h1>
hello world
</h1>
<table>
<tr>
<td>
1 STRING
</td>
<td>
2 STRING
</td>
<td>
3 STRING
</td>
</tr>
</table>
</body>
</html>
Wanted output:
<!-- HTML-->
<tr><td>1 STRING</td><td>2 STRING</td><td>3 STRING</td></tr>
<!--HTML-->
I'm open to using any Python packages to solve the problem.
Just use a regex to remove the newlines ('\n') between the <tr> and </tr> tags:
from bs4 import BeautifulSoup
import re
# Original HTML string (same markup as in the question)
html = '''<html><body><h1>hello world</h1><table><tr><td>1 STRING</td><td>2 STRING</td><td>3 STRING</td></tr></table></body></html>'''

# Parse the markup and produce the multi-line pretty-printed text that the
# regex step below post-processes.
soup = BeautifulSoup(html, 'html.parser')
prettified_html = soup.prettify()
def remove_newlines_in_tr(match: "re.Match[str]") -> str:
    """Collapse one matched ``<tr>...</tr>`` span onto a single line.

    Intended as the replacement callback for ``pattern.sub``.

    Args:
        match: Regex match whose full text (group 0) is a table row that may
            span several lines.

    Returns:
        The matched text with each line stripped of leading and trailing
        whitespace and the lines concatenated without any separator.
    """
    # Stripping per line removes any indentation around each tag or text
    # node; joining with '' removes the line breaks themselves while leaving
    # whitespace inside a line (e.g. "1 STRING") untouched.
    return ''.join(line.strip() for line in match.group(0).split('\n'))
# Match every table row, including rows whose opening tag carries attributes
# (e.g. <tr class="odd">). re.DOTALL lets '.' cross the line breaks inside a
# row, and the non-greedy '.*?' stops at the first closing </tr>.
# NOTE: because it stops at the first </tr>, rows containing nested tables
# are not handled correctly.
pattern = re.compile(r'<tr\b[^>]*>.*?</tr>', re.DOTALL)

# Rewrite each matched row via the callback, leaving the rest of the
# prettified document as it is.
html_inline_tr = pattern.sub(remove_newlines_in_tr, prettified_html)
print(html_inline_tr)