You can easily use xml (from the Python standard library) to convert to a pandas.DataFrame. Here's what I would do (when reading from a file replace xml_data with the name of your file or file object):
import pandas as pd
import xml.etree.ElementTree as ET
import io
def iter_docs(author):
author_attr = author.attrib
for doc in author.iter('document'):
doc_dict = author_attr.copy()
doc_dict.update(doc.attrib)
doc_dict['data'] = doc.text
yield doc_dict
xml_data = io.StringIO(u'''YOUR XML STRING HERE''')
etree = ET.parse(xml_data) #create an ElementTree object
doc_df = pd.DataFrame(list(iter_docs(etree.getroot())))
If there are multiple authors in your original document or the root of your XML is not an author, then I would add the following generator:
def iter_author(etree):
for author in etree.iter('author'):
for row in iter_docs(author):
yield row
and change doc_df = pd.DataFrame(list(iter_docs(etree.getroot()))) to doc_df = pd.DataFrame(list(iter_author(etree)))
Have a look at the ElementTree tutorial provided in the xml library documentation.
Videos
You can easily use xml (from the Python standard library) to convert to a pandas.DataFrame. Here's what I would do (when reading from a file replace xml_data with the name of your file or file object):
import pandas as pd
import xml.etree.ElementTree as ET
import io
def iter_docs(author):
author_attr = author.attrib
for doc in author.iter('document'):
doc_dict = author_attr.copy()
doc_dict.update(doc.attrib)
doc_dict['data'] = doc.text
yield doc_dict
xml_data = io.StringIO(u'''YOUR XML STRING HERE''')
etree = ET.parse(xml_data) #create an ElementTree object
doc_df = pd.DataFrame(list(iter_docs(etree.getroot())))
If there are multiple authors in your original document or the root of your XML is not an author, then I would add the following generator:
def iter_author(etree):
for author in etree.iter('author'):
for row in iter_docs(author):
yield row
and change doc_df = pd.DataFrame(list(iter_docs(etree.getroot()))) to doc_df = pd.DataFrame(list(iter_author(etree)))
Have a look at the ElementTree tutorial provided in the xml library documentation.
As of v1.3, you can simply use:
pandas.read_xml(path_or_file)
if the data is simple, like this, then you can do something like:
from lxml import objectify
xml = objectify.parse('Document1.xml')
root = xml.getroot()
bathrooms = [child.text for child in root['bathrooms'].getchildren()]
price = [child.text for child in root['price'].getchildren()]
property_id = [child.text for child in root['property_id'].getchildren()]
data = [bathrooms, price, property_id]
df = pd.DataFrame(data).T
df.columns = ['bathrooms', 'price', 'property_id']
bathrooms price property_id
0 1.0 7020000.0 35237.0
1 3.0 10000000.0 32238.0
2 nan 4128000.0 44699.0
if it is more complex then a loop is better. You can do something like
from lxml import objectify
xml = objectify.parse('Document1.xml')
root = xml.getroot()
data=[]
for i in range(len(root.getchildren())):
data.append([child.text for child in root.getchildren()[i].getchildren()])
df = pd.DataFrame(data).T
df.columns = ['bathrooms', 'price', 'property_id']
Hello all I found another really easily way to solve those question. reference: https://www.youtube.com/watch?v=WVrg5-cjr5k
import xml.etree.ElementTree as ET
import pandas as pd
import codecs
## open notebook and save your xml file to text.xml
with codecs.open('text.xml', 'r', encoding='utf8') as f:
tt = f.read()
def xml2df(xml_data):
root = ET.XML(xml_data)
all_records = []
for i, child in enumerate(root):
record = {}
for sub_child in child:
record[sub_child.tag] = sub_child.text
all_records.append(record)
return pd.DataFrame(all_records)
df_xml1 = xml2df(tt)
print(df_xml1)
for better understanding of ET you can use underneath code to see what in side of your xml
import xml.etree.ElementTree as ET
import pandas as pd
import codecs
with codecs.open('text.xml', 'r', encoding='utf8') as f:
tt = f.read()
root = ET.XML(tt)
print(type(root))
print(root[0])
for ele in root[0]:
print(ele.tag + '////' + ele.text)
print(root[0][0].tag)
Once you finish running the program you can see the output underneath:
C:\Users\username\Documents\pycode\Scripts\python.exe C:/Users/username/PycharmProjects/DestinationLight/try.py
n35237 n32238 n44699
0 1.0 3.0 nan
1 7020000.0 10000000.0 4128000.0
2 35237.0 32238.0 44699.0
<class 'xml.etree.ElementTree.Element'>
<Element 'bathrooms' at 0x00000285006B6180>
n35237////1.0
n32238////3.0
n44699////nan
n35237
Process finished with exit code 0