Consider XSLT, the special purpose language designed to transform XML files and can directly convert XML to CSV (i.e., text file) without the pandas dataframe intermediary. Python's third-party module lxml (which you are already using) can run XSLT 1.0 scripts and do so without for loops or if logic. However, due to the complex alignment of product and attributes, some longer XPath searches are used with XSLT.
XSLT (save as .xsl file, a special .xml file)
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output indent="no" method="text"/>
<xsl:strip-space elements="*"/>
<xsl:param name="delimiter">,</xsl:param>
<xsl:template match="/PropertySet">
<xsl:text>ProductId,Product,AttributeId,Attribute
</xsl:text>
<xsl:apply-templates select="*"/>
</xsl:template>
<xsl:template match="PropertySet|Message|ListOf_Class_Def|ListOf_Prod_Def|ImpExp">
<xsl:apply-templates select="*"/>
</xsl:template>
<xsl:template match="ListOfObject_Arrt">
<xsl:apply-templates select="Object_Arrt"/>
<xsl:if test="name(*) != 'Object_Arrt' and preceding-sibling::ListOfObject_Def/Object_Def/@Ancestor_Name = ''">
<xsl:value-of select="concat(ancestor::ImpExp/@Name, $delimiter,
ancestor::ImpExp/@Object_Num, $delimiter,
'', $delimiter,
'')"/><xsl:text>
</xsl:text>
</xsl:if>
</xsl:template>
<xsl:template match="Object_Arrt">
<xsl:variable name="attrName" select="ancestor::ImpExp/@Name"/>
<xsl:value-of select="concat(/PropertySet/PropertySet/Message[@IntObjectName='Prod Def']/ListOf_Prod_Def/
ImpExp[ListOfObject_Def/Object_Def/@Ancestor_Name = $attrName]/@Name, $delimiter,
/PropertySet/PropertySet/Message[@IntObjectName='Prod Def']/ListOf_Prod_Def/
ImpExp[ListOfObject_Def/Object_Def/@Ancestor_Name = $attrName]/@Object_Num, $delimiter,
@Orig_Id, $delimiter,
@Attr_Name)"/><xsl:text>
</xsl:text>
</xsl:template>
</xsl:stylesheet>
Python
import lxml.etree as et
# LOAD XML AND XSL
xml = et.parse('Input.xml')
xsl = et.parse('XSLT_Script.xsl')
# RUN TRANSFORMATION
transform = et.XSLT(xsl)
result = transform(xml)
# OUTPUT TO FILE
with open('Output.csv', 'wb') as f:
f.write(result)
Output
ProductId,Product,AttributeId,Attribute
Laptop,2008a,6666p,LP_Portable
Mouse,2987d,7010p,O_Portable
Mouse,2987d,7012j,O_wireless
Speaker,5463g,,
Answer from Parfait on Stack OverflowVideos
Consider XSLT, the special purpose language designed to transform XML files and can directly convert XML to CSV (i.e., text file) without the pandas dataframe intermediary. Python's third-party module lxml (which you are already using) can run XSLT 1.0 scripts and do so without for loops or if logic. However, due to the complex alignment of product and attributes, some longer XPath searches are used with XSLT.
XSLT (save as .xsl file, a special .xml file)
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output indent="no" method="text"/>
<xsl:strip-space elements="*"/>
<xsl:param name="delimiter">,</xsl:param>
<xsl:template match="/PropertySet">
<xsl:text>ProductId,Product,AttributeId,Attribute
</xsl:text>
<xsl:apply-templates select="*"/>
</xsl:template>
<xsl:template match="PropertySet|Message|ListOf_Class_Def|ListOf_Prod_Def|ImpExp">
<xsl:apply-templates select="*"/>
</xsl:template>
<xsl:template match="ListOfObject_Arrt">
<xsl:apply-templates select="Object_Arrt"/>
<xsl:if test="name(*) != 'Object_Arrt' and preceding-sibling::ListOfObject_Def/Object_Def/@Ancestor_Name = ''">
<xsl:value-of select="concat(ancestor::ImpExp/@Name, $delimiter,
ancestor::ImpExp/@Object_Num, $delimiter,
'', $delimiter,
'')"/><xsl:text>
</xsl:text>
</xsl:if>
</xsl:template>
<xsl:template match="Object_Arrt">
<xsl:variable name="attrName" select="ancestor::ImpExp/@Name"/>
<xsl:value-of select="concat(/PropertySet/PropertySet/Message[@IntObjectName='Prod Def']/ListOf_Prod_Def/
ImpExp[ListOfObject_Def/Object_Def/@Ancestor_Name = $attrName]/@Name, $delimiter,
/PropertySet/PropertySet/Message[@IntObjectName='Prod Def']/ListOf_Prod_Def/
ImpExp[ListOfObject_Def/Object_Def/@Ancestor_Name = $attrName]/@Object_Num, $delimiter,
@Orig_Id, $delimiter,
@Attr_Name)"/><xsl:text>
</xsl:text>
</xsl:template>
</xsl:stylesheet>
Python
import lxml.etree as et
# LOAD XML AND XSL
xml = et.parse('Input.xml')
xsl = et.parse('XSLT_Script.xsl')
# RUN TRANSFORMATION
transform = et.XSLT(xsl)
result = transform(xml)
# OUTPUT TO FILE
with open('Output.csv', 'wb') as f:
f.write(result)
Output
ProductId,Product,AttributeId,Attribute
Laptop,2008a,6666p,LP_Portable
Mouse,2987d,7010p,O_Portable
Mouse,2987d,7012j,O_wireless
Speaker,5463g,,
You would need to preparse all of the CLASS_DEF entries into a dictionary. These can then be looked up when processing the PROD_DEF entries:
import csv
from lxml import etree
inFile = "./newm.xml"
outFile = "./new.csv"
tree = etree.parse(inFile)
class_defs = {}
# First extract all the CLASS_DEF entries into a dictionary
for impexp in tree.iter("ImpExp"):
name = impexp.get('Name')
if impexp.get('Type') == "CLASS_DEF":
for list_of_object_arrt in impexp.findall('ListOfObject_Arrt'):
class_defs[name] = [(obj.get('Orig_Id'), obj.get('Attr_Name')) for obj in list_of_object_arrt]
with open(outFile, 'wb') as f_output:
csv_output = csv.writer(f_output)
csv_output.writerow(['ProductId', 'Product', 'AttributeId', 'Attribute'])
for impexp in tree.iter("ImpExp"):
object_num = impexp.get('Object_Num')
name = impexp.get('Name')
if impexp.get('Type') == "PROD_DEF":
for list_of_object_def in impexp.findall('ListOfObject_Def'):
for obj in list_of_object_def:
ancestor_num = obj.get('Ancestor_Num')
ancestor_name = obj.get('Ancestor_Name')
csv_output.writerow([object_num, name] + list(class_defs.get(ancestor_name, [['', '']])[0]))
This would produce new.csv containing:
ProductId,Product,AttributeId,Attribute
2008a,Laptop,6666p,LP_Portable
2987d,Mouse,7010p,O_Portable
5463g,Speaker,,
If you are using Python 3.x, use:
with open(outFile, 'w', newline='') as f_output:
ElementTree is not really the best tool for what I believe you're trying to do. Since you have well-formed, relatively simple xml, try using pandas:
import pandas as pd
#from here, it's just a one liner
pd.read_xml('input.xml',xpath='.//store').to_csv('output.csv',sep=',', index = None, header=True)
and that should get you your csv file.
Given parsing element values and their corresponding attributes involves a second layer of iteration, consider a nested list/dict comphrehension with dictionary merge. Also, use csv.DictWriter to build CSV via dictionaries:
from csv import DictWriter
import xml.etree.ElementTree as ET
ifilepath = "Input.xml"
tree = ET.parse(ifilepath)
nmsp = {"du": "http://www.dummytest.org"}
data = [
{
**{el.tag.split('}')[-1]: (el.text.strip() if el.text is not None else None) for el in d.findall("*")},
**{f"{el.tag.split('}')[-1]} {k}":v for el in d.findall("*") for k,v in el.attrib.items()},
**d.attrib
}
for d in tree.findall(".//du:data", namespaces=nmsp)
]
dkeys = list(data[0].keys())
with open("DummyXMLtoCSV.csv", "w", newline="") as f:
dw = DictWriter(f, fieldnames=dkeys)
dw.writeheader()
dw.writerows(data)
Output
indicator,country,date,value,unit,obs_status,decimal,indicator id,country id
"various, tests",test again,2021,1234567,,,0,AA.BB,MM
"testing, cases",coverage test,2020,3456223,,,0,XX.YY,DD
While above will add attributes to last columns of CSV. For specific ordering, re-order the dictionaries:
data = [ ... ]
cols = ["indicator id", "indicator", "country id", "country", "date", "value", "unit", "obs_status", "decimal"]
data = [
{k: d[k] for k in cols} for d in data
]
with open("DummyXMLtoCSV.csv", "w", newline="") as f:
dw = DictWriter(f, fieldnames=cols)
dw.writeheader()
dw.writerows(data)
Output
indicator id,indicator,country id,country,date,value,unit,obs_status,decimal
AA.BB,"various, tests",MM,test again,2021,1234567,,,0
XX.YY,"testing, cases",DD,coverage test,2020,3456223,,,0
We can use pd.json_normalize() to flatten the dictionary created from the XML. However, since records reside under two different keys: tag_2 and tag_7, we need to loop over those particular tags to get all the records, then concatenate the dataframes.
import pandas as pd
import xmltodict
with open("file_01.xml", "r", encoding="utf-8") as xml_fh:
str_xml = xml_fh.read()
dict_xml = xmltodict.parse(str_xml)
df = pd.concat(
[
pd.json_normalize(
dict_xml,
record_path=['tag_1', tag, 'date', 'data'], # path to record list
meta=[['tag_1', tag, 'date', '@value']]) # path to date
.pipe(lambda x: x.rename(columns={x.columns[-1]: 'date'})) # rename date column
.assign(tag_1='tag_1', tag_2=tag, data='data') # add meta columns
for tag in ('tag_2', 'tag_7') # loop over tags
]
)[['tag_1', 'tag_2', 'date', 'data', 'tag_3', 'tag_4', 'tag_5', 'tag_6']]
df.to_csv('file_01.csv', index=False)
This creates the following CSV file:
tag_1,tag_2,date,data,tag_3,tag_4,tag_5,tag_6
tag_1,tag_2,06-30-2023,data,val_3,val_4,val_5_1 & val_5_2,-0.157
tag_1,tag_2,06-30-2023,data,val_3,val_4_2,val_5_1,-0.173
tag_1,tag_7,06-30-2023,data,val_3,val_4,val_5_1 & val_5_2,-0.157
tag_1,tag_7,06-30-2023,data,val_3,val_4_2,val_5_1,-0.173
Perhaps a more maintainable way is to normalize the relevant sub-dictionary under each level 2 key. Note that in the code below, the record_path and meta paths are no longer lists.
def flatten_dict(dict_xml, level_2_tags):
df = (
pd.concat([
pd.json_normalize(dict_xml['tag_1'][tag]['date'], 'data', '@value')
.assign(tag_2=tag)
for tag in level_2_tags
])
.rename(columns={'@value': 'date'})
.assign(tag_1='tag_1', data='data')
.get(['tag_1', 'tag_2', 'date', 'data', 'tag_3', 'tag_4', 'tag_5', 'tag_6'])
)
return df
# test run
flatten_dict(dict_xml, ['tag_2']) # when there is only tag_2 in level=2
flatten_dict(dict_xml, ['tag_2', 'tag_7']) # when there are 2 tags in level=2
Given the custom format, it looks like the best option is to use a nested list comprehension:
df = pd.DataFrame([{'tag_1': k1, 'tag_2': k2, k3: d3['@value'], **d4}
for k1, d1 in dict_xml.items()
for k2, d2 in d1.items()
for k3, d3 in d2.items()
for d4 in d3['data']])
Output:
tag_1 tag_2 date tag_3 tag_4 tag_5 tag_6
0 tag_1 tag_2 06-30-2023 val_3 val_4 val_5_1 & val_5_2 -0.157
1 tag_1 tag_2 06-30-2023 val_3 val_4_2 val_5_1 -0.173
2 tag_1 tag_7 06-30-2023 val_3 val_4 val_5_1 & val_5_2 -0.157
3 tag_1 tag_7 06-30-2023 val_3 val_4_2 val_5_1 -0.173
CSV output:
# df.to_csv('file_01.csv', index=False)
tag_1,tag_2,date,tag_3,tag_4,tag_5,tag_6
tag_1,tag_2,06-30-2023,val_3,val_4,val_5_1 & val_5_2,-0.157
tag_1,tag_2,06-30-2023,val_3,val_4_2,val_5_1,-0.173
tag_1,tag_7,06-30-2023,val_3,val_4,val_5_1 & val_5_2,-0.157
tag_1,tag_7,06-30-2023,val_3,val_4_2,val_5_1,-0.173
Using pandas and BeautifulSoup you can achieve your expected output easily:
#Code:
import pandas as pd
import itertools
from bs4 import BeautifulSoup as b
with open("file.xml", "r") as f: # opening xml file
content = f.read()
soup = b(content, "lxml")
pkgeid = [ values.text for values in soup.findAll("pkgeid")]
pkgname = [ values.text for values in soup.findAll("pkgname")]
time = [ values.text for values in soup.findAll("time")]
oper = [ values.text for values in soup.findAll("oper")]
# For python-3.x use `zip_longest` method
# For python-2.x use 'izip_longest method
data = [item for item in itertools.zip_longest(time, oper, pkgeid, pkgname)]
df = pd.DataFrame(data=data)
df.to_csv("sample.csv",index=False, header=None)
#output in `sample.csv` file will be as follows:
2015-09-16T04:13:20Z,Create_Product,10,BBCWRL
2015-09-16T04:13:20Z,Create_Product,18,CNNINT
2018-04-01T03:30:28Z,Deactivate_Dhct,,
Using Pandas, parsing all xml fields.
import xml.etree.ElementTree as ET
import pandas as pd
tree = ET.parse("file.xml")
root = tree.getroot()
get_range = lambda col: range(len(col))
l = [{r[i].tag:r[i].text for i in get_range(r)} for r in root]
df = pd.DataFrame.from_dict(l)
df.to_csv('file.csv')
The lxml library is capable of very powerful XML parsing, and can be used to iterate over an XML tree to search for specific elements.
from lxml import etree
with open(r'path/to/xml', 'r') as xml:
text = xml.read()
tree = lxml.etree.fromstring(text)
row = ['', '']
for item in tree.iter('hw', 'def'):
if item.tag == 'hw':
row[0] = item.text
elif item.tag == 'def':
row[1] = item.text
line = ','.join(row)
with open(r'path/to/csv', 'a') as csv:
csv.write(line + '\n')
How you build the CSV file is largely based upon preference, but I have provided a trivial example above. If there are multiple <dps-data> tags, you could extract those elements first (which can be done with the same tree.iter method shown above), and then apply the above logic to each of them.
EDIT: I should point out that this particular implementation reads the entire XML file into memory. If you are working with a single 150mb file at a time, this should not be a problem, but it's just something to be aware of.
How about this:
from xml.dom import minidom
xmldoc = minidom.parse('your.xml')
hw_lst = xmldoc.getElementsByTagName('hw')
defu_lst = xmldoc.getElementsByTagName('def')
with open('your.csv', 'a') as out_file:
for i in range(len(hw_lst)):
out_file.write('{0}, {1}\n'.format(hw_lst[i].firstChild.data, defu_lst[i].firstChild.data))