python parsing

How can I efficiently parse HTML in Python?


I want to parse the HTML code efficiently without an external library.

I have already tried a for loop that checks each character in turn:

# Split the markup into character groups: a new group is started at every
# "<" and every ">", so tag names and the text between tags end up in
# separate entries (empty groups mark adjacent delimiters such as "><").
html = """<html><p>Hello</p></html>"""
tokens = []  # named ``tokens`` so the builtin ``list`` is not shadowed
for ch in html:
    if ch in "<>":
        tokens.append([])
    else:
        if not tokens:
            # Text before the first "<" still needs a group to live in.
            tokens.append([])
        # Accumulate the character; assigning ``tokens[-1] = ch`` would
        # throw away everything collected so far for this group.
        tokens[-1].append(ch)
print(tokens)

But the code was very slow on 50KB files.


Solution

  • I recommend starting with a simple HTML parser like the one shown below. It uses only the standard library that comes with Python (html.parser to tokenize the markup and xml.dom.minidom to hold the resulting tree) and has no external dependencies. It gives you a basic DOM API that should be a good starting point to work from. The code works for the simple case it is meant to tackle, but depending on your end goal you may need to alter and extend it with further functionality.

    #! /usr/bin/env python3
    import html.parser
    import pprint
    import xml.dom.minidom
    
    
    def main():
        """Parse a sample HTML page and print the results of several DOM queries.

        The sample is fed through DocumentParser, the resulting tree is
        normalized, and then the tree, its first <title>, its first <p>, all of
        its <a> elements, the element whose id is 'link3', every href and
        finally every text node are printed in that order.
        """
        # noinspection PyPep8
        markup = '''
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title"><b>The Dormouse's story</b></p>
    
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    
    <p class="story">...</p>
    '''
        # Build the DOM; close() also unwinds elements the sample leaves open.
        builder = DocumentParser()
        builder.feed(markup)
        builder.close()
        root = builder.document.documentElement
        root.normalize()
        print(root.toprettyxml())
        # First <title>: markup, tag name, text content and parent tag.
        title = root.getElementsByTagName('title')[0]
        print(title.toxml())
        print(title.tagName)
        print(title.firstChild.data)
        print(title.parentNode.tagName)
        # First <p>: markup and its class attribute.
        paragraph = root.getElementsByTagName('p')[0]
        print(paragraph.toxml())
        print(paragraph.getAttribute('class'))
        # Every <a>: the first one alone, then all of them as a list.
        anchors = root.getElementsByTagName('a')
        print(anchors[0].toxml())
        pprint.pprint([anchor.toxml() for anchor in anchors])
        # Attribute search through the generic find() helper.
        pprint.pprint([match.toxml() for match in find(root, id='link3')])
        for anchor in anchors:
            print(anchor.getAttribute('href'))
        # One text node per output line.
        print(*get_text(root), sep='\n')
    
    
    class DocumentParser(html.parser.HTMLParser):
        """Build an xml.dom.minidom document from the HTML fed to the parser.

        ``document`` is the minidom Document being populated and ``focus`` is
        the node that currently receives new children. Start tags descend into
        the new element, end tags climb back out, and text is attached to the
        focused element.
        """

        # HTML elements that can never have content or an end tag. They are
        # added to the tree but must not take focus, otherwise everything that
        # follows (e.g. the text after a <br>) would be nested inside them.
        _VOID_ELEMENTS = frozenset({
            'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
            'link', 'meta', 'param', 'source', 'track', 'wbr',
        })

        # noinspection SpellCheckingInspection
        def __init__(self, *, convert_charrefs=True):
            """Create the parser with an empty document that holds the focus."""
            super().__init__(convert_charrefs=convert_charrefs)
            self.document = self.focus = xml.dom.minidom.DOMImplementation() \
                .createDocument(None, None, None)

        @property
        def document_has_focus(self):
            """Return True when no element is open (focus is the document)."""
            return self.document is self.focus

        def handle_starttag(self, tag, attrs):
            """Append a new element to the focus and descend into it.

            Void elements are appended but do not become the focus.
            """
            element = self.document.createElement(tag)
            for name, value in attrs:
                # HTMLParser reports valueless attributes (<input disabled>)
                # with a value of None; store an empty string so attribute
                # values are always text.
                element.setAttribute(name, '' if value is None else value)
            self.focus.appendChild(element)
            if tag not in self._VOID_ELEMENTS:
                self.focus = element

        def handle_endtag(self, tag):
            """Close the nearest open element named *tag*.

            Open elements nested inside it are closed implicitly. An end tag
            with no matching open element is ignored and the focus is left
            where it was.
            """
            node = self.focus
            # Stop at the document: it has no tagName and cannot be closed.
            while node is not self.document and node.tagName != tag:
                node = node.parentNode
            if node is not self.document:
                self.focus = node.parentNode

        def handle_data(self, data):
            """Attach stripped text to the focused element.

            Whitespace-only data and data outside any element are dropped.
            """
            if not self.document_has_focus and not data.isspace():
                self.focus.appendChild(self.document.createTextNode(data.strip()))

        def error(self, message):
            """Raise RuntimeError carrying *message*."""
            raise RuntimeError(message)

        def close(self):
            """Finish parsing and close every element that is still open."""
            super().close()
            while not self.document_has_focus:
                self.focus = self.focus.parentNode
    
    
    def find(element, **kwargs):
        """Yield *element* and its descendants whose attributes match *kwargs*.

        Nodes are visited in document order. A node is yielded when it offers
        ``getAttribute`` and, for every keyword given, ``getAttribute(key)``
        equals the supplied value; nodes without ``getAttribute`` are skipped
        but their children are still searched.
        """
        pending = [element]
        while pending:
            current = pending.pop()
            read_attribute = getattr(current, 'getAttribute', None)
            if read_attribute:
                for key, wanted in kwargs.items():
                    if read_attribute(key) == wanted:
                        continue
                    break
                else:
                    # No keyword disagreed (or none were given).
                    yield current
            # Reversed so the first child is popped, and so visited, first.
            pending.extend(reversed(current.childNodes))
    
    
    def get_nodes_by_type(node, node_type):
        """Yield *node* and its descendants whose ``nodeType`` is *node_type*.

        Nodes are produced in document order (a node before its children,
        children from first to last).
        """
        stack = [node]
        while stack:
            candidate = stack.pop()
            if candidate.nodeType == node_type:
                yield candidate
            # Push children back to front so the first child comes off next.
            stack.extend(reversed(candidate.childNodes))
    
    
    def get_text(node):
        """Return a generator over the ``data`` of every text node under *node*.

        Text nodes are located with get_nodes_by_type using *node*'s own
        ``TEXT_NODE`` constant.
        """
        text_nodes = get_nodes_by_type(node, node.TEXT_NODE)
        return (text_node.data for text_node in text_nodes)
    
    
    # Run the demo only when this file is executed as a script, not when it
    # is imported as a module.
    if __name__ == '__main__':
        main()