How can I use Python to split an HTML string by a specified unpaired (void) tag? For example:
split('hello<br >there', 'br')
should return ['hello', 'there']
and
split('<div id="d71">text1<br data-i="1">text2<br>text3</div>', 'br')
should return ['<div id="d71">text1', 'text2', 'text3</div>']
or ['<div id="d71">text1</div>', '<div id="d71">text2</div>', '<div id="d71">text3</div>']
I've had a look at
def get_start_stop(source, tag_name):
    """List the attribute names available on the first ``tag_name`` element.

    Exploratory helper: ``source`` is parsed with BeautifulSoup's
    ``html.parser`` backend, the first element named ``tag_name`` is looked
    up, and ``dir()`` of that lookup result is returned. It does not yet
    compute any start/stop offsets.
    """
    first_match = BeautifulSoup(source, 'html.parser').find(tag_name)
    return dir(first_match)
But the attributes I was hopeful about (`sourcepos`, `string`, `strings`, `self_and_descendants`, and `.nextSibling.sourcepos`) don't have the information necessary (as far as I can tell) to get the start and end indexes of the tag in the source string.
I've also tried things like
from lxml import html
def split(input_str, tag_name):
    """Collect one entry per element of the parsed ``input_str``.

    Every element yielded by ``tree.iter()`` contributes exactly one item:
    the element's ``tail`` when its tag equals ``tag_name``, otherwise the
    element serialized as a unicode string with ``with_tail=False``.
    """
    tree = html.fromstring(input_str)
    return [
        node.tail
        if node.tag == tag_name
        else html.tostring(node, encoding='unicode', with_tail=False)
        for node in tree.iter()
    ]
but `with_tail=False` doesn't do what I expect.
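Find the element you want to split on, then delete everything that comes before it (its previous siblings, and the previous siblings of each of its ancestors) as well as the element itself. This will give you the right side of the split result.
Do the same thing (but delete everything that comes after it, i.e. the next siblings at every level) to get the left side.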
from itertools import chain
from bs4 import BeautifulSoup
def get_right(soup: BeautifulSoup, tag_name: str) -> BeautifulSoup:
    """Return a copy of ``soup`` holding only what follows the first ``tag_name``.

    The input tree is not modified: it is serialized and re-parsed, and all
    pruning happens on that copy. If no ``tag_name`` element exists, an
    empty document is returned.
    """
    clone = BeautifulSoup(str(soup), 'html.parser')
    separator = clone.find(tag_name)
    if separator is None:
        return BeautifulSoup('', 'html.parser')

    # Walk from the separator up through every ancestor and drop whatever
    # precedes each of them; the ancestors themselves stay, so the
    # enclosing tags are kept around the remaining content.
    for node in [separator, *separator.parents]:
        # Snapshot the siblings first: extract() mutates the tree.
        for earlier in list(node.previous_siblings):
            earlier.extract()
    separator.extract()
    return clone
def get_left(soup: BeautifulSoup, tag_name: str) -> BeautifulSoup:
    """Return a copy of ``soup`` holding only what precedes the first ``tag_name``.

    The input tree is not modified: it is serialized and re-parsed, and all
    pruning happens on that copy. If no ``tag_name`` element exists, the
    unpruned copy is returned.
    """
    clone = BeautifulSoup(str(soup), 'html.parser')
    separator = clone.find(tag_name)
    if separator is None:
        return clone

    # Walk from the separator up through every ancestor and drop whatever
    # follows each of them; the ancestors themselves stay, so the
    # enclosing tags are kept around the remaining content.
    for node in [separator, *separator.parents]:
        # Snapshot the siblings first: extract() mutates the tree.
        for later in list(node.next_siblings):
            later.extract()
    separator.extract()
    return clone
def split(input_str: str, tag_name: str) -> list[str]:
    """Split an HTML string at every occurrence of the tag ``tag_name``.

    The input is parsed with BeautifulSoup's ``html.parser``. Each segment
    is serialized back to a string and keeps the tags that enclosed the
    separator, so splitting inside a ``<div>`` yields one ``<div>`` per
    segment.

    A leading or trailing separator produces an empty segment, as
    ``str.split`` does: ``split('<br>there<br>', 'br')`` returns
    ``['', 'there', '']``.

    Args:
        input_str: The HTML source to split.
        tag_name: Name of the separator tag, e.g. ``'br'``.

    Returns:
        The serialized segments in document order. Input that parses to an
        empty document returns an empty list.
    """
    remainder = BeautifulSoup(input_str, 'html.parser')
    # len() of a soup is its number of top-level nodes; an empty document
    # has no segments at all.
    if not len(remainder):
        return []

    output = []
    # Loop on "is there another separator?" rather than on len(remainder):
    # when the separator is the last top-level node, the remainder is empty
    # and a length-based loop would stop before emitting the final ''.
    while remainder.find(tag_name) is not None:
        output.append(str(get_left(remainder, tag_name)))
        remainder = get_right(remainder, tag_name)
    # Whatever is left after the last separator is the final segment.
    output.append(str(remainder))
    return output
>>> split('hello<br >there', 'br')
['hello', 'there']
>>> split('start<div id="d71">text1<br data-i="1">text2<br>text3</div>end', 'br')
['start<div id="d71">text1</div>', '<div id="d71">text2</div>', '<div id="d71">text3</div>end']
>>> split('<br>there<br>', 'br')
['', 'there', '']
>>> split('there', 'br')
['there']