pythonxmldataframelxml

XML and probably LXML issue


I have many XML files that look like this

<?xml version="1.0" encoding="utf-8" standalone="no"?>
<reiXmlPrenos>
  <Qfl>1808</Qfl>
  <fOVE>13.7</fOVE>
  <NetoVolumen>613</NetoVolumen>
  <Hv>104.2</Hv>
  <energenti>
    <energent>
      <sifra>energy_e</sifra>
      <naziv>EE [kWh]</naziv>
      <vrednost>238981</vrednost>
    </energent>
    <energent>
      <sifra>energy_to</sifra>
      <naziv>Do</naziv>
      <vrednost>16359</vrednost>
    </energent>
    <energent>
      <sifra>energy_en</sifra>
      <naziv>En</naziv>
      <vrednost>0</vrednost>
    </energent>
</energenti>
  <rei>
    <zavetrovanost>2</zavetrovanost>
    <cone>
      <cona>
        <cona_id>1</cona_id>
        <cc_si_cona>1110000</cc_si_cona>
        <visina_cone>2.7</visina_cone>
        <dolzina_cone>14</dolzina_cone>
      </cona>
      <cona>
        <cona_id>2</cona_id>
        <cc_si_cona>120000</cc_si_cona>
      </cona>
  </rei>
</reiXmlPrenos>

I would like to extract certain values from those XML files. So I put together with the help from people here code below that is suppose to work:

import pandas as pd
import glob
import os
from lxml import etree


os.chdir(r'R:\...\XML-1122_test')
dir_path = glob.glob('*.xml')
xmls = dir_path


#note: For simplicity, I'm using the well formed version of the xml strings in your question; you'll have to use actual file names and paths
energies = ["xml", "energy_ge", "energy_en", "energy_dteu", "energy_dtlb"]
#I just made up some names - you'll have to use actual names, of course; the first one is for the file identifier - see below
rows = []
for xml in xmls:
    row = []
    id = "xml-"+str(xmls.index(xml)+1)
    #this creates the file identifier
    row.append(id)
    root = etree.XML(xml.encode())
    #in real life, you'll have to use the parse() method
    
    for energy in energies[1:]:
        #the '[1:]' is used to skip the first "energy"; it's only used as the file identifier
        target = root.xpath(f'//energent[./sifra[.="{energy}"]]/vrednost/text()')
        #note the use of f-strings
        row.extend( target if len(target)>0 else "0" )
    rows.append(row)

print(pd.DataFrame(rows,columns=energies))

But in the end I get a warning:

File "<string>", line 1
XMLSyntaxError: Start tag expected, '<' not found, line 1, column 1

Is this an XML issue? Or maybe lxml issue? Does anyone know how to approach this?

Ideally, the result would look like this

xml       energy_e   energy_en   energy_to
xml-1    238981      0         16539 
xml-2    ...         ..        .. 

Solution

  • import pandas as pd
    import glob
    import os
    from lxml import etree
    
    
    os.chdir(r'R:\...\XML-1122_test')
    dir_path = glob.glob('*.xml')
    xmls = dir_path
    
    energies = ["xml", "energy_e", "energy_en", "energy_to"]
    rows = []
    for xml in xmls:
        row = []
        id = "xml-"+str(xmls.index(xml)+1)
        row.append(id)
        with open(xml, 'r', encoding='utf-8') as f:
            xml_string = f.read()
        root = etree.XML(xml_string.encode())
        
        for energy in energies[1:]:
            target = root.xpath(f'//energent[./sifra="{energy}"]/vrednost/text()')
            row.extend(target if len(target)>0 else ["0"])
        rows.append(row)
    
    print(pd.DataFrame(rows, columns=energies)
    

    parse below

    import pandas as pd
    import glob
    import os
    from lxml import etree
    
    
    os.chdir(r'R:\...\XML-1122_test')
    dir_path = glob.glob('*.xml')
    xmls = dir_path
    
    
    energies = ["xml", "energy_ge", "energy_en", "energy_dteu", "energy_dtlb"]
    rows = []
    for xml in xmls:
        row = []
        id = "xml-"+str(xmls.index(xml)+1)
        row.append(id)
        root = etree.parse(xml)
        
        for energy in energies[1:]:
            target = root.xpath(f'//energent[./sifra[.="{energy}"]]/vrednost/text()')
            row.extend( target if len(target)>0 else "0" )
        rows.append(row)
    
    print(pd.DataFrame(rows,columns=energies))