I have many XML files that look like this
<?xml version="1.0" encoding="utf-8" standalone="no"?>
<reiXmlPrenos>
<Qfl>1808</Qfl>
<fOVE>13.7</fOVE>
<NetoVolumen>613</NetoVolumen>
<Hv>104.2</Hv>
<energenti>
<energent>
<sifra>energy_e</sifra>
<naziv>EE [kWh]</naziv>
<vrednost>238981</vrednost>
</energent>
<energent>
<sifra>energy_to</sifra>
<naziv>Do</naziv>
<vrednost>16359</vrednost>
</energent>
<energent>
<sifra>energy_en</sifra>
<naziv>En</naziv>
<vrednost>0</vrednost>
</energent>
</energenti>
<rei>
<zavetrovanost>2</zavetrovanost>
<cone>
<cona>
<cona_id>1</cona_id>
<cc_si_cona>1110000</cc_si_cona>
<visina_cone>2.7</visina_cone>
<dolzina_cone>14</dolzina_cone>
</cona>
<cona>
<cona_id>2</cona_id>
<cc_si_cona>120000</cc_si_cona>
</cona>
</rei>
</reiXmlPrenos>
I would like to extract certain values from those XML files. So I put together with the help from people here code below that is suppose to work:
import pandas as pd
import glob
import os
from lxml import etree
os.chdir(r'R:\...\XML-1122_test')
dir_path = glob.glob('*.xml')
xmls = dir_path
#note: For simplicity, I'm using the well formed version of the xml strings in your question; you'll have to use actual file names and paths
energies = ["xml", "energy_ge", "energy_en", "energy_dteu", "energy_dtlb"]
#I just made up some names - you'll have to use actual names, of course; the first one is for the file identifier - see below
rows = []
for xml in xmls:
row = []
id = "xml-"+str(xmls.index(xml)+1)
#this creates the file identifier
row.append(id)
root = etree.XML(xml.encode())
#in real life, you'll have to use the parse() method
for energy in energies[1:]:
#the '[1:]' is used to skip the first "energy"; it's only used as the file identifier
target = root.xpath(f'//energent[./sifra[.="{energy}"]]/vrednost/text()')
#note the use of f-strings
row.extend( target if len(target)>0 else "0" )
rows.append(row)
print(pd.DataFrame(rows,columns=energies))
But in the end I get a warning:
File "<string>", line 1
XMLSyntaxError: Start tag expected, '<' not found, line 1, column 1
Is this an XML issue? Or maybe lxml issue? Does anyone know how to approach this?
Ideally, the result would look like this
xml energy_e energy_en energy_to
xml-1 238981 0 16539
xml-2 ... .. ..
import pandas as pd
import glob
import os
from lxml import etree
os.chdir(r'R:\...\XML-1122_test')
dir_path = glob.glob('*.xml')
xmls = dir_path
energies = ["xml", "energy_e", "energy_en", "energy_to"]
rows = []
for xml in xmls:
row = []
id = "xml-"+str(xmls.index(xml)+1)
row.append(id)
with open(xml, 'r', encoding='utf-8') as f:
xml_string = f.read()
root = etree.XML(xml_string.encode())
for energy in energies[1:]:
target = root.xpath(f'//energent[./sifra="{energy}"]/vrednost/text()')
row.extend(target if len(target)>0 else ["0"])
rows.append(row)
print(pd.DataFrame(rows, columns=energies)
parse below
import pandas as pd
import glob
import os
from lxml import etree
os.chdir(r'R:\...\XML-1122_test')
dir_path = glob.glob('*.xml')
xmls = dir_path
energies = ["xml", "energy_ge", "energy_en", "energy_dteu", "energy_dtlb"]
rows = []
for xml in xmls:
row = []
id = "xml-"+str(xmls.index(xml)+1)
row.append(id)
root = etree.parse(xml)
for energy in energies[1:]:
target = root.xpath(f'//energent[./sifra[.="{energy}"]]/vrednost/text()')
row.extend( target if len(target)>0 else "0" )
rows.append(row)
print(pd.DataFrame(rows,columns=energies))