Consider XSLT, the special-purpose language designed to transform XML files, which can directly convert XML to CSV (i.e., a text file) without the pandas dataframe intermediary. Python's third-party module lxml (which you are already using) can run XSLT 1.0 scripts and do so without for loops or if logic. However, due to the complex alignment of products and attributes, some longer XPath searches are used in the XSLT.
XSLT (save as .xsl file, a special .xml file)
<!-- XSLT 1.0 stylesheet: flattens the product/attribute XML directly to CSV
     text (run via lxml's XSLT support from Python). -->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <!-- Emit raw text (CSV), not XML. -->
  <xsl:output indent="no" method="text"/>
  <xsl:strip-space elements="*"/>
  <!-- Column separator; can be overridden when invoking the transform. -->
  <xsl:param name="delimiter">,</xsl:param>

  <!-- Document root: print the CSV header line once, then walk children. -->
  <xsl:template match="/PropertySet">
    <xsl:text>ProductId,Product,AttributeId,Attribute
</xsl:text>
    <xsl:apply-templates select="*"/>
  </xsl:template>

  <!-- Structural wrapper elements: contribute no output, just recurse. -->
  <xsl:template match="PropertySet|Message|ListOf_Class_Def|ListOf_Prod_Def|ImpExp">
    <xsl:apply-templates select="*"/>
  </xsl:template>

  <!-- Attribute list: emit one row per Object_Arrt child. Additionally, when
       there is no Object_Arrt child and the sibling Object_Def has an empty
       Ancestor_Name, emit a row with blank attribute columns (apparently for
       entries that have no class attributes - e.g. the Speaker output row). -->
  <xsl:template match="ListOfObject_Arrt">
    <xsl:apply-templates select="Object_Arrt"/>
    <xsl:if test="name(*) != 'Object_Arrt' and preceding-sibling::ListOfObject_Def/Object_Def/@Ancestor_Name = ''">
      <xsl:value-of select="concat(ancestor::ImpExp/@Name, $delimiter,
                                   ancestor::ImpExp/@Object_Num, $delimiter,
                                   '', $delimiter,
                                   '')"/><xsl:text>
</xsl:text>
    </xsl:if>
  </xsl:template>

  <!-- One CSV row per attribute: the owning class name ($attrName) is used
       to find the PROD_DEF ImpExp whose Object_Def/@Ancestor_Name matches,
       pairing product columns with this attribute's Orig_Id/Attr_Name. -->
  <xsl:template match="Object_Arrt">
    <xsl:variable name="attrName" select="ancestor::ImpExp/@Name"/>
    <xsl:value-of select="concat(/PropertySet/PropertySet/Message[@IntObjectName='Prod Def']/ListOf_Prod_Def/
                                 ImpExp[ListOfObject_Def/Object_Def/@Ancestor_Name = $attrName]/@Name, $delimiter,
                                 /PropertySet/PropertySet/Message[@IntObjectName='Prod Def']/ListOf_Prod_Def/
                                 ImpExp[ListOfObject_Def/Object_Def/@Ancestor_Name = $attrName]/@Object_Num, $delimiter,
                                 @Orig_Id, $delimiter,
                                 @Attr_Name)"/><xsl:text>
</xsl:text>
  </xsl:template>
</xsl:stylesheet>
Python
import lxml.etree as et

# LOAD XML AND XSL
xml = et.parse('Input.xml')
xsl = et.parse('XSLT_Script.xsl')

# RUN TRANSFORMATION
transform = et.XSLT(xsl)
result = transform(xml)

# OUTPUT TO FILE
# `result` is an lxml _XSLTResultTree, not bytes; passing it straight to
# f.write() raises TypeError. bytes(result) serializes the text output.
with open('Output.csv', 'wb') as f:
    f.write(bytes(result))
Output
ProductId,Product,AttributeId,Attribute
Laptop,2008a,6666p,LP_Portable
Mouse,2987d,7010p,O_Portable
Mouse,2987d,7012j,O_wireless
Speaker,5463g,,
Answer from Parfait on Stack Overflow
Consider XSLT, the special-purpose language designed to transform XML files, which can directly convert XML to CSV (i.e., a text file) without the pandas dataframe intermediary. Python's third-party module lxml (which you are already using) can run XSLT 1.0 scripts and do so without for loops or if logic. However, due to the complex alignment of products and attributes, some longer XPath searches are used in the XSLT.
XSLT (save as .xsl file, a special .xml file)
<!-- XSLT 1.0 stylesheet: flattens the product/attribute XML directly to CSV
     text (run via lxml's XSLT support from Python). -->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <!-- Emit raw text (CSV), not XML. -->
  <xsl:output indent="no" method="text"/>
  <xsl:strip-space elements="*"/>
  <!-- Column separator; can be overridden when invoking the transform. -->
  <xsl:param name="delimiter">,</xsl:param>

  <!-- Document root: print the CSV header line once, then walk children. -->
  <xsl:template match="/PropertySet">
    <xsl:text>ProductId,Product,AttributeId,Attribute
</xsl:text>
    <xsl:apply-templates select="*"/>
  </xsl:template>

  <!-- Structural wrapper elements: contribute no output, just recurse. -->
  <xsl:template match="PropertySet|Message|ListOf_Class_Def|ListOf_Prod_Def|ImpExp">
    <xsl:apply-templates select="*"/>
  </xsl:template>

  <!-- Attribute list: emit one row per Object_Arrt child. Additionally, when
       there is no Object_Arrt child and the sibling Object_Def has an empty
       Ancestor_Name, emit a row with blank attribute columns (apparently for
       entries that have no class attributes - e.g. the Speaker output row). -->
  <xsl:template match="ListOfObject_Arrt">
    <xsl:apply-templates select="Object_Arrt"/>
    <xsl:if test="name(*) != 'Object_Arrt' and preceding-sibling::ListOfObject_Def/Object_Def/@Ancestor_Name = ''">
      <xsl:value-of select="concat(ancestor::ImpExp/@Name, $delimiter,
                                   ancestor::ImpExp/@Object_Num, $delimiter,
                                   '', $delimiter,
                                   '')"/><xsl:text>
</xsl:text>
    </xsl:if>
  </xsl:template>

  <!-- One CSV row per attribute: the owning class name ($attrName) is used
       to find the PROD_DEF ImpExp whose Object_Def/@Ancestor_Name matches,
       pairing product columns with this attribute's Orig_Id/Attr_Name. -->
  <xsl:template match="Object_Arrt">
    <xsl:variable name="attrName" select="ancestor::ImpExp/@Name"/>
    <xsl:value-of select="concat(/PropertySet/PropertySet/Message[@IntObjectName='Prod Def']/ListOf_Prod_Def/
                                 ImpExp[ListOfObject_Def/Object_Def/@Ancestor_Name = $attrName]/@Name, $delimiter,
                                 /PropertySet/PropertySet/Message[@IntObjectName='Prod Def']/ListOf_Prod_Def/
                                 ImpExp[ListOfObject_Def/Object_Def/@Ancestor_Name = $attrName]/@Object_Num, $delimiter,
                                 @Orig_Id, $delimiter,
                                 @Attr_Name)"/><xsl:text>
</xsl:text>
  </xsl:template>
</xsl:stylesheet>
Python
import lxml.etree as et

# LOAD XML AND XSL
xml = et.parse('Input.xml')
xsl = et.parse('XSLT_Script.xsl')

# RUN TRANSFORMATION
transform = et.XSLT(xsl)
result = transform(xml)

# OUTPUT TO FILE
# `result` is an lxml _XSLTResultTree, not bytes; passing it straight to
# f.write() raises TypeError. bytes(result) serializes the text output.
with open('Output.csv', 'wb') as f:
    f.write(bytes(result))
Output
ProductId,Product,AttributeId,Attribute
Laptop,2008a,6666p,LP_Portable
Mouse,2987d,7010p,O_Portable
Mouse,2987d,7012j,O_wireless
Speaker,5463g,,
You would need to preparse all of the CLASS_DEF entries into a dictionary. These can then be looked up when processing the PROD_DEF entries:
import csv
from lxml import etree

inFile = "./newm.xml"
outFile = "./new.csv"

tree = etree.parse(inFile)

# First pass: map each CLASS_DEF name to its list of (Orig_Id, Attr_Name)
# pairs so that PROD_DEF rows can be joined by Ancestor_Name afterwards.
class_defs = {}
for impexp in tree.iter("ImpExp"):
    name = impexp.get('Name')
    if impexp.get('Type') == "CLASS_DEF":
        for list_of_object_arrt in impexp.findall('ListOfObject_Arrt'):
            class_defs[name] = [(obj.get('Orig_Id'), obj.get('Attr_Name')) for obj in list_of_object_arrt]

# Second pass: one CSV row per PROD_DEF object definition, joined to the
# FIRST attribute pair of its ancestor class (blank columns when absent).
# Fixed: csv in Python 3 requires text mode with newline='' - the original
# 'wb' binary mode raised TypeError (the trailing note in the answer already
# pointed this out).
with open(outFile, 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['ProductId', 'Product', 'AttributeId', 'Attribute'])
    for impexp in tree.iter("ImpExp"):
        object_num = impexp.get('Object_Num')
        name = impexp.get('Name')
        if impexp.get('Type') == "PROD_DEF":
            for list_of_object_def in impexp.findall('ListOfObject_Def'):
                for obj in list_of_object_def:
                    ancestor_name = obj.get('Ancestor_Name')
                    csv_output.writerow([object_num, name] + list(class_defs.get(ancestor_name, [['', '']])[0]))
This would produce new.csv containing:
ProductId,Product,AttributeId,Attribute
2008a,Laptop,6666p,LP_Portable
2987d,Mouse,7010p,O_Portable
5463g,Speaker,,
If you are using Python 3.x, use:
with open(outFile, 'w', newline='') as f_output:
ElementTree is not really the best tool for what I believe you're trying to do. Since you have well-formed, relatively simple xml, try using pandas:
import pandas as pd

# Parse every <store> node into a flat DataFrame, then dump it as CSV.
store_df = pd.read_xml('input.xml', xpath='.//store')
store_df.to_csv('output.csv', sep=',', index=None, header=True)
and that should get you your csv file.
Given that parsing element values and their corresponding attributes involves a second layer of iteration, consider a nested list/dict comprehension with a dictionary merge. Also, use csv.DictWriter to build the CSV via dictionaries:
from csv import DictWriter
import xml.etree.ElementTree as ET

ifilepath = "Input.xml"

tree = ET.parse(ifilepath)
nmsp = {"du": "http://www.dummytest.org"}


def _row(node):
    """Flatten one <data> node into a single dict: child texts first, then
    child attributes keyed as "<tag> <attr>", then the node's own attributes
    (merge order matters for the resulting CSV column order)."""
    texts = {}
    attrs = {}
    for child in node.findall("*"):
        tag = child.tag.split('}')[-1]
        texts[tag] = child.text.strip() if child.text is not None else None
        for key, val in child.attrib.items():
            attrs[f"{tag} {key}"] = val
    return {**texts, **attrs, **node.attrib}


data = [_row(d) for d in tree.findall(".//du:data", namespaces=nmsp)]

# Field names come from the first record; presumably all records share the
# same keys - otherwise DictWriter raises ValueError on extra keys.
dkeys = list(data[0].keys())

with open("DummyXMLtoCSV.csv", "w", newline="") as f:
    dw = DictWriter(f, fieldnames=dkeys)
    dw.writeheader()
    dw.writerows(data)
Output
indicator,country,date,value,unit,obs_status,decimal,indicator id,country id
"various, tests",test again,2021,1234567,,,0,AA.BB,MM
"testing, cases",coverage test,2020,3456223,,,0,XX.YY,DD
While above will add attributes to last columns of CSV. For specific ordering, re-order the dictionaries:
data = [ ... ]
# Fixed column order for the CSV output.
cols = ["indicator id", "indicator", "country id", "country", "date", "value", "unit", "obs_status", "decimal"]

# Rebuild every row dict so its keys follow the desired column order.
data = [{key: row[key] for key in cols} for row in data]

with open("DummyXMLtoCSV.csv", "w", newline="") as f:
    dw = DictWriter(f, fieldnames=cols)
    dw.writeheader()
    dw.writerows(data)
Output
indicator id,indicator,country id,country,date,value,unit,obs_status,decimal
AA.BB,"various, tests",MM,test again,2021,1234567,,,0
XX.YY,"testing, cases",DD,coverage test,2020,3456223,,,0
We can use pd.json_normalize() to flatten the dictionary created from the XML. However, since records reside under two different keys: tag_2 and tag_7, we need to loop over those particular tags to get all the records, then concatenate the dataframes.
import pandas as pd
import xmltodict

# Read the raw XML and convert it to a nested dict with xmltodict.
with open("file_01.xml", "r", encoding="utf-8") as xml_fh:
    str_xml = xml_fh.read()
dict_xml = xmltodict.parse(str_xml)

# Records live under two different level-2 keys (tag_2 and tag_7), so
# normalize each separately and concatenate the resulting frames.
df = pd.concat(
    [
        pd.json_normalize(
            dict_xml,
            record_path=['tag_1', tag, 'date', 'data'],  # path to record list
            meta=[['tag_1', tag, 'date', '@value']])  # path to date
        # The meta column's auto-generated name is the last column; rename it.
        .pipe(lambda x: x.rename(columns={x.columns[-1]: 'date'}))  # rename date column
        .assign(tag_1='tag_1', tag_2=tag, data='data')  # add meta columns
        for tag in ('tag_2', 'tag_7')  # loop over tags
    ]
)[['tag_1', 'tag_2', 'date', 'data', 'tag_3', 'tag_4', 'tag_5', 'tag_6']]
df.to_csv('file_01.csv', index=False)
This creates the following CSV file:
tag_1,tag_2,date,data,tag_3,tag_4,tag_5,tag_6
tag_1,tag_2,06-30-2023,data,val_3,val_4,val_5_1 & val_5_2,-0.157
tag_1,tag_2,06-30-2023,data,val_3,val_4_2,val_5_1,-0.173
tag_1,tag_7,06-30-2023,data,val_3,val_4,val_5_1 & val_5_2,-0.157
tag_1,tag_7,06-30-2023,data,val_3,val_4_2,val_5_1,-0.173
Perhaps a more maintainable way is to normalize the relevant sub-dictionary under each level 2 key. Note that in the code below, the record_path and meta paths are no longer lists.
def flatten_dict(dict_xml, level_2_tags):
    """Normalize the 'date'/'data' records found under each level-2 tag of
    *dict_xml* into a single DataFrame with a fixed column order."""
    frames = []
    for tag in level_2_tags:
        # Flatten this tag's record list; the '@value' date rides along as meta.
        part = pd.json_normalize(dict_xml['tag_1'][tag]['date'], 'data', '@value')
        frames.append(part.assign(tag_2=tag))
    combined = pd.concat(frames)
    combined = combined.rename(columns={'@value': 'date'})
    combined = combined.assign(tag_1='tag_1', data='data')
    # .get() selects the fixed column order (returns None if any are missing).
    return combined.get(['tag_1', 'tag_2', 'date', 'data', 'tag_3', 'tag_4', 'tag_5', 'tag_6'])
# test run
flatten_dict(dict_xml, ['tag_2']) # when there is only tag_2 in level=2
flatten_dict(dict_xml, ['tag_2', 'tag_7']) # when there are 2 tags in level=2
Given the custom format, it looks like the best option is to use a nested list comprehension:
# Walk three dict levels (root tag -> level-2 tag -> date entry) and emit one
# row per record of the innermost 'data' list. k3 (e.g. 'date') becomes a
# column holding that entry's '@value'; each record dict d4 is splatted in.
# Assumes dict_xml was produced by xmltodict.parse as shown earlier.
df = pd.DataFrame([{'tag_1': k1, 'tag_2': k2, k3: d3['@value'], **d4}
                   for k1, d1 in dict_xml.items()
                   for k2, d2 in d1.items()
                   for k3, d3 in d2.items()
                   for d4 in d3['data']])
Output:
tag_1 tag_2 date tag_3 tag_4 tag_5 tag_6
0 tag_1 tag_2 06-30-2023 val_3 val_4 val_5_1 & val_5_2 -0.157
1 tag_1 tag_2 06-30-2023 val_3 val_4_2 val_5_1 -0.173
2 tag_1 tag_7 06-30-2023 val_3 val_4 val_5_1 & val_5_2 -0.157
3 tag_1 tag_7 06-30-2023 val_3 val_4_2 val_5_1 -0.173
CSV output:
# df.to_csv('file_01.csv', index=False)
tag_1,tag_2,date,tag_3,tag_4,tag_5,tag_6
tag_1,tag_2,06-30-2023,val_3,val_4,val_5_1 & val_5_2,-0.157
tag_1,tag_2,06-30-2023,val_3,val_4_2,val_5_1,-0.173
tag_1,tag_7,06-30-2023,val_3,val_4,val_5_1 & val_5_2,-0.157
tag_1,tag_7,06-30-2023,val_3,val_4_2,val_5_1,-0.173
Using pandas and BeautifulSoup you can achieve your expected output easily:
#Code:
# Code:
import pandas as pd
import itertools
from bs4 import BeautifulSoup as b

# Read the raw XML once, then hand it to BeautifulSoup (lxml backend).
with open("file.xml", "r") as f:
    content = f.read()
soup = b(content, "lxml")


def _texts(tag_name):
    """Collect the text of every occurrence of *tag_name* in document order."""
    return [node.text for node in soup.findAll(tag_name)]


pkgeid = _texts("pkgeid")
pkgname = _texts("pkgname")
time = _texts("time")
oper = _texts("oper")

# zip_longest pads the shorter columns with None so the row counts line up.
# For Python 2.x use itertools.izip_longest instead.
data = list(itertools.zip_longest(time, oper, pkgeid, pkgname))

df = pd.DataFrame(data=data)
df.to_csv("sample.csv", index=False, header=None)
#output in `sample.csv` file will be as follows:
2015-09-16T04:13:20Z,Create_Product,10,BBCWRL
2015-09-16T04:13:20Z,Create_Product,18,CNNINT
2018-04-01T03:30:28Z,Deactivate_Dhct,,
Using Pandas, parsing all xml fields.
import xml.etree.ElementTree as ET
import pandas as pd

tree = ET.parse("file.xml")
root = tree.getroot()

# Build one dict per first-level record element: child tag -> child text.
records = []
for record in root:
    records.append({child.tag: child.text for child in record})

df = pd.DataFrame.from_dict(records)
df.to_csv('file.csv')
The lxml library is capable of very powerful XML parsing, and can be used to iterate over an XML tree to search for specific elements.
from lxml import etree

with open(r'path/to/xml', 'r') as xml:
    text = xml.read()

# Fixed: the module was imported as `etree`, so the call is etree.fromstring;
# the original `lxml.etree.fromstring` raised NameError (`lxml` is unbound).
tree = etree.fromstring(text)

row = ['', '']
for item in tree.iter('hw', 'def'):
    if item.tag == 'hw':
        row[0] = item.text
    elif item.tag == 'def':
        row[1] = item.text
        # A <def> completes the current pair, so append one CSV line.
        # (Assumes each <hw> is followed by its <def> - confirm against data.)
        line = ','.join(row)
        with open(r'path/to/csv', 'a') as csv:
            csv.write(line + '\n')
How you build the CSV file is largely based upon preference, but I have provided a trivial example above. If there are multiple <dps-data> tags, you could extract those elements first (which can be done with the same tree.iter method shown above), and then apply the above logic to each of them.
EDIT: I should point out that this particular implementation reads the entire XML file into memory. If you are working with a single 150mb file at a time, this should not be a problem, but it's just something to be aware of.
How about this:
from xml.dom import minidom

xmldoc = minidom.parse('your.xml')
hw_lst = xmldoc.getElementsByTagName('hw')
defu_lst = xmldoc.getElementsByTagName('def')

# Append one "hw, def" line per index. Indexing defu_lst raises IndexError
# when there are fewer <def> than <hw> nodes, same as the original loop.
with open('your.csv', 'a') as out_file:
    for i in range(len(hw_lst)):
        headword = hw_lst[i].firstChild.data
        definition = defu_lst[i].firstChild.data
        out_file.write('{0}, {1}\n'.format(headword, definition))
While XML as a data format can take many forms from flat to deeply nested, data frames must adhere to a single structure of two dimensions: row by column. Hence, as noted in docs, pandas.read_xml, is a convenience method best for flatter, shallow XML files. You can use xpath to traverse different areas of the document, not just the default /*.
However, you can use XSLT 1.0 (special purpose language designed to transform XML files) with the default parser, lxml, to transform any XML to the needed flat format of data frame. Below stylesheet will restyle the <slike> node for comma-separated text of its children <slika>:
XSLT (save as .xsl file, a special .xml file)
<!-- Identity-transform stylesheet: copies the document unchanged except for
     <slike>, whose children are collapsed to comma-separated text so that
     pandas.read_xml can ingest it as one flat column. -->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:output method="xml" omit-xml-declaration="no" indent="yes"/>
  <xsl:strip-space elements="*"/>

  <!-- Identity template: copy every node and attribute as-is. -->
  <xsl:template match="node()|@*">
    <xsl:copy>
      <xsl:apply-templates select="node()|@*"/>
    </xsl:copy>
  </xsl:template>

  <!-- Override for <slike>: join each child's text with commas, skipping
       the separator after the last child. -->
  <xsl:template match="slike">
    <xsl:copy>
      <xsl:for-each select="*">
        <xsl:value-of select="text()"/>
        <xsl:if test="position() != last()">
          <xsl:text>,</xsl:text>
        </xsl:if>
      </xsl:for-each>
    </xsl:copy>
  </xsl:template>
</xsl:stylesheet>
Online Demo
Python
# Run the XML through the stylesheet above so nested <slike> arrives as a
# single comma-separated string column.
artikal_df = pd.read_xml("my_filename.xml", stylesheet="my_style.xsl")

# CONVERT COMMA-SEPARATED VALUES TO EMBEDDED LISTS
artikal_df["slike"] = artikal_df["slike"].str.split(',')

# PREFIX PARENT NODE NAME
artikal_df = artikal_df.add_prefix('artikal_')

artikal_df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 2 entries, 0 to 1
# Data columns (total 12 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 artikal_id 2 non-null int64
# 1 artikal_sifra 2 non-null int64
# 2 artikal_barKod 2 non-null int64
# 3 artikal_naziv 2 non-null object
# 4 artikal_kategorija1 2 non-null object
# 5 artikal_kategorija2 2 non-null object
# 6 artikal_kategorija3 2 non-null object
# 7 artikal_vpCena 2 non-null float64
# 8 artikal_mpCena 2 non-null float64
# 9 artikal_dostupan 2 non-null int64
# 10 artikal_opis 0 non-null float64
# 11 artikal_slike 2 non-null object
# dtypes: float64(3), int64(4), object(5)
# memory usage: 320.0+ bytes
You start by reading the xml file and also making a placeholder file for you to write the output in a csv format (or any other text format - you might have to tweak the code a bit).
Then you specify the names of the columns in your final dataframe (after you have parsed the XML file). But this information is already in your XML file anyway, so you just need to make sure you understand the contents.
Lastly, loop over the entries and find the keywords (column names) to read and write to the csv.
Once done, you can read the csv using pd.read_csv('output.csv').
import xml.etree.ElementTree as ET
import csv

# Load and parse the XML file
tree = ET.parse('your_xml_file.xml')
root = tree.getroot()

# The CSV handle is managed by a context manager so it is closed even if
# an error occurs while looping over the records.
with open('output.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)

    # Write header row
    header = ['column1', 'column2', 'column3', 'column4', 'column5']
    csv_writer.writerow(header)

    # Emit one row per <main_identifier>; a missing child becomes ''.
    # (`record` replaces the original loop variable `id`, which shadowed
    # the builtin of the same name.)
    for record in root.findall('.//main_identifier'):
        # Fixed: the guard originally probed record.find('column') - a tag
        # that does not exist - so column1 was always written as ''.
        column1_text = record.find('column1').text if record.find('column1') is not None else ''
        column2_text = record.find('.//column2').text if record.find('.//column2') is not None else ''
        column3_text = record.find('.//column3').text if record.find('.//column3') is not None else ''
        # Fixed: was assigned to `column4` but written out as `column4_text`,
        # which raised NameError at runtime.
        column4_text = record.find('.//column4').text if record.find('.//column4') is not None else ''
        column5_text = record.find('.//column5').text if record.find('.//column5') is not None else ''

        # Write data to CSV
        csv_writer.writerow([column1_text, column2_text, column3_text, column4_text, column5_text])