python beautifulsoup lxml

How to split an HTML string around tags?


How can I use Python to split an HTML string on a specified unpaired (void) tag? For example,

split('hello<br    >there', 'br')

should return ['hello', 'there'],

split('<div id="d71">text1<br data-i="1">text2<br>text3</div>', 'br')

should return ['<div id="d71">text1', 'text2', 'text3</div>'] or ['<div id="d71">text1</div>', '<div id="d71">text2</div>', '<div id="d71">text3</div>']

I've had a look at

def get_start_stop(source, tag_name):
    """Exploratory helper: list the attribute names of the first *tag_name* tag.

    Parses *source* with ``html.parser`` and returns ``dir()`` of whatever
    ``find`` yields, so the available attributes can be inspected by hand.
    """
    first_match = BeautifulSoup(source, 'html.parser').find(tag_name)
    return dir(first_match)

But the attributes I was hopeful about (sourcepos, string, strings, self_and_descendants, .nextSibling.sourcepos) don't contain the information necessary (as far as I can tell) to get the start and end indexes of the tag in the source string.

I've also tried things like

from lxml import html

def split(input_str, tag_name):
    """Attempted splitter: build one list entry per element of the parsed tree.

    Every element reached by ``tree.iter()`` contributes exactly one entry:
    the element's tail text when it is a *tag_name* element, otherwise the
    element serialised to a unicode string with ``with_tail=False``.
    """
    tree = html.fromstring(input_str)

    def render(element):
        # Separator tags contribute only the text that follows them.
        if element.tag == tag_name:
            return element.tail
        return html.tostring(element, encoding='unicode', with_tail=False)

    return [render(element) for element in tree.iter()]

but with_tail=False doesn't do what I expect


Solution

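  • Find the element you want to split on, then delete everything that precedes it (its previous siblings and the previous siblings of each of its ancestors) along with the element itself. This gives you the right side of the split.

    Do the same thing, but deleting everything that follows the element instead, to get the left side.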

    from itertools import chain
    
    from bs4 import BeautifulSoup
    
    
    def get_right(soup: BeautifulSoup, tag_name: str) -> BeautifulSoup:
        """Return a copy of *soup* holding only what follows the first *tag_name* tag.

        The input tree is left untouched: its markup is serialised and parsed
        again with ``html.parser``, and all pruning happens on that copy. The
        ancestors of the tag are kept, so the remaining content stays wrapped
        in the same enclosing tags.

        Args:
            soup: Parsed document to take the right-hand side of.
            tag_name: Name of the tag to split on.

        Returns:
            The pruned copy, or an empty document when *soup* contains no
            *tag_name* tag.
        """
        copy = BeautifulSoup(str(soup), 'html.parser')
        pivot = copy.find(tag_name)
        if pivot is None:
            return BeautifulSoup('', 'html.parser')
        # Climb from the tag to the root, dropping everything to the left at
        # each level. The siblings are snapshotted into a list first because
        # extracting nodes while walking the live generator would cut it short.
        for node in [pivot, *pivot.parents]:
            for earlier in list(node.previous_siblings):
                earlier.extract()
        pivot.extract()
        return copy
    
    
    def get_left(soup: BeautifulSoup, tag_name: str) -> BeautifulSoup:
        """Return a copy of *soup* holding only what precedes the first *tag_name* tag.

        The input tree is left untouched: its markup is serialised and parsed
        again with ``html.parser``, and all pruning happens on that copy. The
        ancestors of the tag are kept, so the remaining content stays wrapped
        in the same enclosing tags.

        Args:
            soup: Parsed document to take the left-hand side of.
            tag_name: Name of the tag to split on.

        Returns:
            The pruned copy; an unpruned copy of the whole document when
            *soup* contains no *tag_name* tag.
        """
        copy = BeautifulSoup(str(soup), 'html.parser')
        pivot = copy.find(tag_name)
        if pivot is None:
            return copy
        # Climb from the tag to the root, dropping everything to the right at
        # each level. The siblings are snapshotted into a list first because
        # extracting nodes while walking the live generator would cut it short.
        for node in [pivot, *pivot.parents]:
            for later in list(node.next_siblings):
                later.extract()
        pivot.extract()
        return copy
    
    
    def split(input_str: str, tag_name: str) -> list[str]:
        """Split the HTML in *input_str* around every *tag_name* tag.

        Behaves like ``str.split``: the separator tags themselves are dropped
        and empty segments are kept, so input that starts or ends with the
        tag produces a leading or trailing ``''``, and empty input produces
        ``['']``. Tags enclosing a separator are repeated around each piece.

        Args:
            input_str: HTML markup to split.
            tag_name: Name of the tag to split on (e.g. ``'br'``).

        Returns:
            The pieces, in document order, serialised back to strings. There
            is always exactly one more piece than there are separator tags.
        """
        remainder = BeautifulSoup(input_str, 'html.parser')
        pieces = []
        # Loop on "is there another separator?" rather than on whether the
        # remainder is non-empty: an empty remainder right after a separator
        # is a genuine (empty) trailing segment and must still be emitted.
        while remainder.find(tag_name) is not None:
            pieces.append(str(get_left(remainder, tag_name)))
            remainder = get_right(remainder, tag_name)
        # Whatever is left contains no separator and forms the final piece.
        pieces.append(str(remainder))
        return pieces
    
    >>> split('hello<br    >there', 'br')
    ['hello', 'there']
    >>> split('start<div id="d71">text1<br data-i="1">text2<br>text3</div>end', 'br')
    ['start<div id="d71">text1</div>', '<div id="d71">text2</div>', '<div id="d71">text3</div>end']
    >>> split('<br>there<br>', 'br')
    ['', 'there', '']
    >>> split('there', 'br')
    ['there']