I am attempting to parse product data from icecat. The data comes in large xml files. (3-7gb).
In order to reduce the amount of product data I am bringing in, I need to filter this list before moving to my next step. Particularly I need to filter by the "Updated" and "On_Market" values in the "file" element, and IF the "Country_Markets" child exists, I need to see if that (possible) list of children contains <"Country_Market = "US">
I am able to get the On_Market filter to work, but can't figure out how to structure the check of the Country_Markets child, or implement the date filter.
Where I am currently:
PYTHON:
from xml.etree.ElementTree import iterparse
file_path = 'index.xml'
dict_list = []
date = "20220803143328"
for _, elem in iterparse(file_path, events=("end",)):
for child in elem:
print(child)
if child.tag == 'Country_Markets':
if child.attrib['Country_Market'] == "US" OR child['Country_Markets'] is None:
if elem.tag == "file":
if elem.attrib['On_Market'] == "1":
if elem.attrib['Updated']>= date:
dict_list.append({'IceId': elem.attrib['Product_ID'],
'LastUpdate': elem.attrib['Updated'],
'PartNum': elem.attrib['Prod_ID'],
'OnMarket': elem.attrib['On_Market']})
elem.clear()
df = pd.DataFrame(dict_list)
EDIT NEW PYTHON APPROACH
I am trying to implement a different package which seems to do the trick, except I haven't been able to figure out how to incorporate the EAN_UPCS into my output...
from lxml import etree
context = etree.iterparse(file_path, events=("start", "end"),)
for event, elem in context:
if elem.tag == 'file':
for child1 in elem:
if child1.tag == 'Country_Markets':
for child2 in child1:
if child2.attrib['Value'] == "US":
if elem.attrib['On_Market'] == "1":
if elem.attrib['Updated']>= "20220803143328":
print(f"'IceId': {elem.attrib['Product_ID']}")
print(f"'LastUpdate': {elem.attrib['Updated']}")
print(f"'PartNum': {elem.attrib['Prod_ID']}")
print(f"'OnMarket': {elem.attrib['On_Market']}")
if child1.tag == 'EAN_UPCS':
for child2 in child1:
if child2.attrib['IsApproved'] == "1":
print(child2.attrib['Value'])
EDIT 2:
I can get the result I am after if I run segments of the loop, but once I run the entire loop, I either lose information, or append all information.
dict_list = []
context = etree.iterparse(file_path, events=("end",))
for event, elem in context:
if elem.tag == 'file':
id = elem.attrib['Product_ID']
print(id)
valid = "no"
dist = ()
last_date = ()
market = ()
upc_list = []
if elem.attrib['On_Market'] == "1":
market = "yes"
if elem.attrib['Updated']>= "20170803143328":
last_date = "yes"
for child1 in elem:
if child1.tag =='Country_Markets':
print('markets found')
for child2 in child1:
if child2.attrib['Value'] == "US":
dist = "yes"
else:
if dist != "yes":
dist = "no"
#had to come up with a way to not overwrite the distibution while iterating throug elements
elif elem.find("Country_Markets") is None:
print("No Markets")
dist = "yes"
if child1.tag == 'EAN_UPCS':
for child2 in child1:
if child2.attrib['IsApproved'] == "1":
upc_list.append(child2.attrib['Value'])
print(id, dist, last_date, market, upc_list)
if dist == "yes" and last_date == "yes" and market == "yes":
dict_list.append({elem.attrib['Product_ID']:{'PartNum':elem.attrib['Prod_ID'], 'Updated' : elem.attrib['Updated'], 'UPCs': upc_list}} )
continue
elem.clear()
del context
dict_list
XML:
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE ICECAT-interface SYSTEM "https://data.icecat.biz/dtd/files.index.dtd">
<!--source: Icecat.biz 2022-->
<ICECAT-interface xmlns:xsi="https://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="https://data.icecat.biz/xsd/files.index.xsd">
<files.index Generated="20220930000002">
<file path="export/level4/US/1402.xml" Limited="No" HighPic="https://images.icecat.biz/img/gallery/1402_9631004284.jpg" HighPicSize="744457" HighPicWidth="2670" HighPicHeight="1407" Product_ID="1402" Updated="20220212085227" Quality="ICECAT" Prod_ID="C4893A" Supplier_id="1" Catid="377" On_Market="1" Model_Name="80 Value Pack 350-ml Yellow DesignJet Ink Cartridge and Printhead" Product_View="92380" Date_Added="20051028000000">
<Country_Markets>
<Country_Market Value="BE"/>
<Country_Market Value="FR"/>
<Country_Market Value="US"/>
<Country_Market Value="GB"/>
<Country_Market Value="DE"/>
<Country_Market Value="CH"/>
<Country_Market Value="IT"/>
<Country_Market Value="CA"/>
</Country_Markets>
</file>
<file path="export/level4/US/1414.xml" Limited="No" HighPic="https://images.icecat.biz/img/norm/high/1414-HP.jpg" HighPicSize="43288" HighPicWidth="400" HighPicHeight="400" Product_ID="1414" Updated="20220711134129" Quality="ICECAT" Prod_ID="C6614NE" Supplier_id="1" Catid="377" On_Market="1" Model_Name="C6614NE" Product_View="98879" Date_Added="20051023000000">
</file>
<file path="export/level4/US/1415.xml" Limited="No" HighPic="https://images.icecat.biz/img/norm/high/1415-HP.jpg" HighPicSize="43235" HighPicWidth="400" HighPicHeight="400" Product_ID="1415" Updated="20190404035203" Quality="ICECAT" Prod_ID="51650CE" Supplier_id="1" Catid="377" On_Market="1" Model_Name="50 Cyan Inkjet Print Cartridge" Product_View="60706" Date_Added="20051023000000">
<EAN_UPCS>
<EAN_UPC Value="0088698200223" IsApproved="0" Format="GTIN-13"/>
<EAN_UPC Value="088698200223" IsApproved="0" Format="GTIN-12"/>
</EAN_UPCS>
<Country_Markets>
<Country_Market Value="BE"/>
<Country_Market Value="DE"/>
<Country_Market Value="IT"/>
<Country_Market Value="UA"/>
<Country_Market Value="DZ"/>
</Country_Markets>
</file>
<file path="export/level4/US/7966778.xml" Limited="No" HighPic="https://inishop.com/img/norm/high/7966778-4280.jpg" HighPicSize="814349" HighPicWidth="2761" HighPicHeight="1600" Product_ID="7966778" Updated="20201106094740" Quality="ICECAT" Prod_ID="AX3U1600XC2G79-3X" Supplier_id="2634" Catid="911" On_Market="1" Model_Name="XPG Xtreme Series, DDR3, 1600 MHz, CL7, 6GB (2GB x 3)" Product_View="7328" Date_Added="20110223000000">
<EAN_UPCS>
<EAN_UPC Value="4713435791172" IsApproved="1" Format="GTIN-13"/>
</EAN_UPCS>
<Country_Markets>
<Country_Market Value="US"/>
</Country_Markets>
</file>
<file path="export/level4/US/7966779.xml" Limited="No" HighPic="https://inishop.com/img/norm/high/7966778-4280.jpg" HighPicSize="793195" HighPicWidth="2761" HighPicHeight="1600" Product_ID="7966779" Updated="20201106094740" Quality="ICECAT" Prod_ID="AX3U1600XC4G79-3X" Supplier_id="2634" Catid="911" On_Market="1" Model_Name="XPG Xtreme Series, DDR3, 1600 MHz, CL7, 6GB (2GB x 3)" Product_View="6515" Date_Added="20110223000000">
<EAN_UPCS>
<EAN_UPC Value="4713435791714" IsApproved="1" Format="GTIN-13"/>
</EAN_UPCS>
<Country_Markets>
<Country_Market Value="LU"/>
<Country_Market Value="CH"/>
</Country_Markets>
</file>
<file path="export/level4/US/7966780.xml" Limited="No" HighPic="https://inishop.com/img/norm/high/7966780-2331.jpg" HighPicSize="724700" HighPicWidth="2761" HighPicHeight="1600" Product_ID="7966780" Updated="20201106094740" Quality="ICECAT" Prod_ID="AX3U1600XC4G79-2X" Supplier_id="2634" Catid="911" On_Market="1" Model_Name="XPG Xtreme Series, DDR3, 1600 MHz, CL7, 8GB (4GB x 2)" Product_View="6902" Date_Added="20110223000000">
<EAN_UPCS>
<EAN_UPC Value="4713435791707" IsApproved="1" Format="GTIN-13"/>
</EAN_UPCS>
<Country_Markets>
<Country_Market Value="LU"/>
<Country_Market Value="CH"/>
</Country_Markets>
</file>
</files.index>
</ICECAT-interface>
The goal is to be able to filter out any items that are older than a certain date, not on market, and explicitly not for sale in the US. (Items that don't have Country_Markets elements are to be included in the output.)
Consider this adjusted iterparse
approach without the inner loops across child elements. Below uses flag variables to be turned on and off conditionally while walking down the tree. A dictionary of data is built and iteratively appended within <file>...</file>
context.
from lxml import etree
#from xml.etree import ElementTree as etree
file_path = "Input.xml"
files_data = []
context = etree.iterparse(file_path, events=("start", "end"),)
for event, elem in context:
if event == "start":
if elem.tag == 'file':
data = {}
US_Flag = 0
Date_Flag = 0
if elem.attrib['Updated'] >= "20220803143328":
Date_Flag = 1
data['IceId'] = elem.attrib['Product_ID']
data['LastUpdate'] = elem.attrib['Updated']
data['PartNum'] = elem.attrib['Prod_ID']
data['OnMarket'] = elem.attrib['On_Market']
data['EAN_UPCS'] = []
if elem.tag == "EAN_UPC" and Date_Flag == 1:
if elem.attrib['IsApproved'] == "1":
data['EAN_UPCS'].append(elem.attrib['Value'])
if elem.tag == "Country_Market" and Date_Flag == 1:
data['Country_Market'] = "US"
if elem.attrib["Value"] == "US":
US_Flag = 1
if event == "end":
if elem.tag == "file" and (US_Flag == 1 or "Country_Market" not in data):
files_data.append(data)
elem.clear()
Output
For demonstration, below results remove >= "20220803143328"
condition since posted XML does not contain dates in that range
from pprint import pprint
pprint(files_data)
[{'Country_Market': 'US',
'EAN_UPCS': [],
'IceId': '1402',
'LastUpdate': '20220212085227',
'OnMarket': '1',
'PartNum': 'C4893A'},
{'EAN_UPCS': [],
'IceId': '1414',
'LastUpdate': '20220711134129',
'OnMarket': '1',
'PartNum': 'C6614NE'},
{'Country_Market': 'US',
'EAN_UPCS': ['4713435791172'],
'IceId': '7966778',
'LastUpdate': '20201106094740',
'OnMarket': '1',
'PartNum': 'AX3U1600XC2G79-3X'}]