python lxml doctype xml-declaration

Preserving original doctype and declaration of an lxml.etree parsed xml


I'm using Python's lxml to read an XML document, modify it, and write it back, but the original doctype and XML declaration disappear. Is there an easy way to put them back, whether through lxml or some other solution?


Solution

  • tl;dr

    # adds declaration with version and encoding regardless of
    # which attributes were present in the original declaration
    # expects utf-8 encoding (encode/decode calls)
    # depending on your needs you might want to improve that
    from lxml import etree
    from xml.dom.minidom import parseString
    # Sample input that carries both an XML declaration and a DOCTYPE.
    xml1 = '''\
    <?xml version="1.0" encoding="UTF-8"?>
    <!DOCTYPE root SYSTEM "example.dtd">
    <root>...</root>
    '''
    # Sample input with neither an XML declaration nor a DOCTYPE.
    xml2 = '''\
    <root>...</root>
    '''
    def has_xml_declaration(xml):
        """Tell whether *xml* carries an XML declaration.

        The text is parsed with ``xml.dom.minidom`` and the ``version``
        attribute of the resulting document is handed back unchanged, so the
        result is meant to be used for its truthiness rather than compared
        against ``True``/``False``.
        """
        document = parseString(xml)
        return document.version
    def process(xml):
        """Parse *xml* with lxml and print the re-serialized document.

        The text is encoded with ``str.encode()`` before parsing and the
        serialized bytes are decoded with ``bytes.decode()`` before printing.
        An XML declaration is requested from ``etree.tostring`` only when
        ``has_xml_declaration(xml)`` is truthy; in that case the encoding
        recorded in the parsed tree's ``docinfo`` is passed along as well.
        """
        tree = etree.fromstring(xml.encode()).getroottree()
        # Build the serializer options up front so there is one tostring call.
        options = {}
        if has_xml_declaration(xml):
            options = {
                "xml_declaration": True,
                "encoding": tree.docinfo.encoding,
            }
        print(etree.tostring(tree, **options).decode())
    # Run both samples: xml1 has an XML declaration, xml2 does not.
    process(xml1)
    process(xml2)
    

    The following will include the DOCTYPE and the XML declaration:

    from lxml import etree
    from StringIO import StringIO
    
    # Parse from a file-like object; the document declares iso-8859-1 and
    # has a DOCTYPE with an internal entity definition.
    tree = etree.parse(StringIO('''<?xml version="1.0" encoding="iso-8859-1"?>
     <!DOCTYPE root SYSTEM "test" [ <!ENTITY tasty "eggs"> ]>
      <root>
       <a>&tasty;</a>
     </root>
    '''))
    
    # Reuse the encoding recorded in the tree's docinfo when serializing,
    # and ask tostring to emit the XML declaration.
    docinfo = tree.docinfo
    print etree.tostring(tree, xml_declaration=True, encoding=docinfo.encoding)
    

    Note, tostring does not preserve the DOCTYPE if you create an Element (e.g. using fromstring), it only works when you process the XML using parse.

    Update: as pointed out by J.F. Sebastian my assertion about fromstring is not true.

    Here is some code to highlight the differences between Element and ElementTree serialization:

    from lxml import etree
    from StringIO import StringIO
    
    # Test document: declares iso-8859-1 and has a DOCTYPE with an internal
    # entity definition that is referenced inside <a>.
    xml_str = '''<?xml version="1.0" encoding="iso-8859-1"?>
     <!DOCTYPE root SYSTEM "test" [ <!ENTITY tasty "eggs"> ]>
      <root>
       <a>&tasty;</a>
     </root>
    '''
    
    # Case 1: get the ElementTree using parse, then serialize the tree with
    # the encoding taken from its docinfo.
    parse_tree = etree.parse(StringIO(xml_str))
    encoding = parse_tree.docinfo.encoding
    result = etree.tostring(parse_tree, xml_declaration=True, encoding=encoding)
    print "%s\nparse ElementTree:\n%s\n" % ('-'*20, result)
    
    # Case 2: get the ElementTree using fromstring plus getroottree(); the
    # tree is serialized exactly as in case 1.
    fromstring_tree = etree.fromstring(xml_str).getroottree()
    encoding = fromstring_tree.docinfo.encoding
    result = etree.tostring(fromstring_tree, xml_declaration=True, encoding=encoding)
    print "%s\nfromstring ElementTree:\n%s\n" % ('-'*20, result)
    
    # Case 3: serialize the Element returned by fromstring directly.
    # DOCTYPE is lost, and no access to encoding (no docinfo is consulted
    # here, so no encoding argument is passed to tostring).
    fromstring_element = etree.fromstring(xml_str)
    result = etree.tostring(fromstring_element, xml_declaration=True)
    print "%s\nfromstring Element:\n%s\n" % ('-'*20, result)
    

    and the output is:

    --------------------
    parse ElementTree:
    <?xml version='1.0' encoding='iso-8859-1'?>
    <!DOCTYPE root SYSTEM "test" [
    <!ENTITY tasty "eggs">
    ]>
    <root>
       <a>eggs</a>
     </root>
    
    --------------------
    fromstring ElementTree:
    <?xml version='1.0' encoding='iso-8859-1'?>
    <!DOCTYPE root SYSTEM "test" [
    <!ENTITY tasty "eggs">
    ]>
    <root>
       <a>eggs</a>
     </root>
    
    --------------------
    fromstring Element:
    <?xml version='1.0' encoding='ASCII'?>
    <root>
       <a>eggs</a>
     </root>