You can easily use xml (from the Python standard library) to convert to a pandas.DataFrame. Here's what I would do (when reading from a file replace xml_data with the name of your file or file object):
import pandas as pd
import xml.etree.ElementTree as ET
import io
def iter_docs(author):
    """Yield one flat dict per <document> element under *author*.

    Each yielded dict merges the author element's attributes with the
    document element's attributes, plus a 'data' key holding the
    document's text content.
    """
    author_attr = author.attrib
    for doc in author.iter('document'):
        # Copy so each yielded row is independent of the shared author attrs.
        doc_dict = author_attr.copy()
        doc_dict.update(doc.attrib)
        doc_dict['data'] = doc.text
        yield doc_dict
# Sample data; when reading from a file, pass the filename or file
# object to ET.parse instead of the StringIO wrapper.
xml_data = io.StringIO(u'''YOUR XML STRING HERE''')
etree = ET.parse(xml_data) #create an ElementTree object
# One row per <document>: author/document attributes become columns.
doc_df = pd.DataFrame(list(iter_docs(etree.getroot())))
If there are multiple authors in your original document or the root of your XML is not an author, then I would add the following generator:
def iter_author(etree):
    """Yield document rows for every <author> element found in *etree*.

    Use this instead of calling iter_docs on the root when the XML
    contains multiple authors, or when the root is not an <author>.
    """
    for author in etree.iter('author'):
        # Delegate the per-author flattening to iter_docs.
        yield from iter_docs(author)
and change doc_df = pd.DataFrame(list(iter_docs(etree.getroot()))) to doc_df = pd.DataFrame(list(iter_author(etree)))
Have a look at the ElementTree tutorial provided in the xml library documentation.
Videos
You can easily use xml (from the Python standard library) to convert to a pandas.DataFrame. Here's what I would do (when reading from a file replace xml_data with the name of your file or file object):
import pandas as pd
import xml.etree.ElementTree as ET
import io
def iter_docs(author):
    """Flatten an <author> element into per-document row dicts.

    For every <document> below *author*, yields a dict combining the
    author's attributes, the document's attributes, and the document
    text under the 'data' key.
    """
    author_attr = author.attrib
    for doc in author.iter('document'):
        # Fresh copy per document so rows don't alias each other.
        doc_dict = author_attr.copy()
        doc_dict.update(doc.attrib)
        doc_dict['data'] = doc.text
        yield doc_dict
# Replace the placeholder with your XML, or pass a filename/file object
# straight to ET.parse.
xml_data = io.StringIO(u'''YOUR XML STRING HERE''')
etree = ET.parse(xml_data) #create an ElementTree object
# Build one DataFrame row per <document> under the root element.
doc_df = pd.DataFrame(list(iter_docs(etree.getroot())))
If there are multiple authors in your original document or the root of your XML is not an author, then I would add the following generator:
def iter_author(etree):
    """Yield flattened document rows for each <author> in *etree*.

    Handles documents with several authors (or a non-author root) by
    walking every <author> element and delegating to iter_docs.
    """
    for author in etree.iter('author'):
        yield from iter_docs(author)
and change doc_df = pd.DataFrame(list(iter_docs(etree.getroot()))) to doc_df = pd.DataFrame(list(iter_author(etree)))
Have a look at the ElementTree tutorial provided in the xml library documentation.
As of v1.3, you can simply use:
pandas.read_xml(path_or_file)
Use [] to filter and reorganize columns:
# Desired column order; indexing the frame with a list both filters
# and reorders the columns in one step.
cols = ['Application_ID', 'Product_Type', 'Product_ID']
df = pd.read_xml('product.xml')[cols]
print(df)
# Output:
Application_ID Product_Type Product_ID
0 BBC#:1010 1 32
1 NBA#:1111 2 22
2 BBC#:1212 1 63
3 NBA#:2210 2 22
If you want to replace '_' from your column names by ' ':
# Vectorised rename: replace underscores in every column label at once.
df.columns = df.columns.str.replace('_', ' ')
print(df)
# Output:
Application ID Product Type Product ID
0 BBC#:1010 1 32
1 NBA#:1111 2 22
2 BBC#:1212 1 63
3 NBA#:2210 2 22
As of Pandas 1.3.0 there is a read_xml() function that makes working with reading/writing XML data in/out of pandas much easier.
Once you upgrade to Pandas >1.3.0 you can simply use:
# read_xml is available from pandas 1.3.0 onwards.
df = pd.read_xml("___XML_FILEPATH___")
print(df)
(Note that in the XML sample above the <Rowset> tag needs to be closed)
if the data is simple, like this, then you can do something like:
from lxml import objectify

xml = objectify.parse('Document1.xml')
root = xml.getroot()
# NOTE: Element.getchildren() is deprecated (it was removed from the
# stdlib ElementTree in Python 3.9).  With lxml.objectify, plain
# iteration over an element yields same-tag SIBLINGS, so the correct
# replacement here is iterchildren(), which yields the child elements.
bathrooms = [child.text for child in root['bathrooms'].iterchildren()]
price = [child.text for child in root['price'].iterchildren()]
property_id = [child.text for child in root['property_id'].iterchildren()]
data = [bathrooms, price, property_id]
df = pd.DataFrame(data).T  # transpose so each property becomes one row
df.columns = ['bathrooms', 'price', 'property_id']
bathrooms price property_id
0 1.0 7020000.0 35237.0
1 3.0 10000000.0 32238.0
2 nan 4128000.0 44699.0
if it is more complex then a loop is better. You can do something like
from lxml import objectify

xml = objectify.parse('Document1.xml')
root = xml.getroot()
# getchildren() is deprecated; iterchildren() walks the child elements
# directly, so there is no need for index-based access either.
data = []
for node in root.iterchildren():
    # one inner list per child of the root: the texts of its children
    data.append([child.text for child in node.iterchildren()])
df = pd.DataFrame(data).T
df.columns = ['bathrooms', 'price', 'property_id']
Hello all, I found another really easy way to solve this question. Reference: https://www.youtube.com/watch?v=WVrg5-cjr5k
import xml.etree.ElementTree as ET
import pandas as pd
import codecs
## open notebook and save your xml file to text.xml
# Read the whole XML document into a single string.
with codecs.open('text.xml', 'r', encoding='utf8') as f:
    tt = f.read()
def xml2df(xml_data):
    """Parse an XML string into a DataFrame.

    Each direct child of the root becomes one row; the tags of its own
    children become the columns and their text the cell values.
    """
    root = ET.XML(xml_data)
    all_records = []
    # The original used enumerate() but never read the index.
    for child in root:
        record = {}
        for sub_child in child:
            record[sub_child.tag] = sub_child.text
        all_records.append(record)
    return pd.DataFrame(all_records)
# tt holds the raw XML text read from text.xml above.
df_xml1 = xml2df(tt)
print(df_xml1)
For a better understanding of ElementTree, you can use the code below to see what is inside your XML:
import xml.etree.ElementTree as ET
import pandas as pd
import codecs
# Read the file, parse it, and print pieces of the tree to see how the
# document is structured before writing any conversion code.
with codecs.open('text.xml', 'r', encoding='utf8') as f:
    tt = f.read()
root = ET.XML(tt)
print(type(root))  # the parsed root is an Element
print(root[0])     # first child element of the root
for ele in root[0]:
    # show tag and text of each grandchild
    print(ele.tag + '////' + ele.text)
print(root[0][0].tag)
Once you finish running the program you can see the output below:
C:\Users\username\Documents\pycode\Scripts\python.exe C:/Users/username/PycharmProjects/DestinationLight/try.py
n35237 n32238 n44699
0 1.0 3.0 nan
1 7020000.0 10000000.0 4128000.0
2 35237.0 32238.0 44699.0
<class 'xml.etree.ElementTree.Element'>
<Element 'bathrooms' at 0x00000285006B6180>
n35237////1.0
n32238////3.0
n44699////nan
n35237
Process finished with exit code 0
Indeed, in forthcoming Pandas 1.3, read_xml will allow you to migrate parsed nodes into data frames. However, because XML can have many dimensions beyond the 2D of rows by columns, as noted:
This method is best designed to import shallow XML documents
Therefore, any nested elements are not immediately picked up as shown here with about 20 columns. Notice the required use of namespaces due to the default namespace in document.
Pandas 1.3+
url = "https://www.sec.gov/Archives/edgar/data/1279392/000114554921008161/primary_doc.xml"
# The document declares a default namespace, so the xpath must use a
# prefix that is mapped to that namespace URI via `namespaces`.
df = pd.read_xml(url, xpath="//edgar:invstOrSec",
namespaces={"edgar": "http://www.sec.gov/edgar/nport"})
print(df)
# name lei title cusip ... fairValLevel securityLending assetCat debtSec
# 0 Tastemade Inc. NaN Tastemade Inc. 999999999 ... 3.0 NaN None NaN
# 1 Regatta XV Funding Ltd., Subordinated Note, Pr... NaN Regatta XV Funding Ltd., Subordinated Note, Pr... 75888PAC7 ... 2.0 NaN ABS-CBDO NaN
# 2 Hired, Inc., Series C Preferred Stock NaN Hired, Inc., Series C Preferred Stock NaN ... 3.0 NaN EP NaN
# 3 WESTVIEW CAPITAL PARTNERS II LP NaN WESTVIEW CAPITAL PARTNERS II LP 999999999 ... NaN NaN None NaN
# 4 VOYAGER CAPITAL FUND III, L.P. NaN VOYAGER CAPITAL FUND III, L.P. 999999999 ... NaN NaN None NaN
.. ... ... ... ... ... ... ... ... ...
# 158 ARCLIGHT ENERGY PARTNERS FUND V, L.P. NaN ARCLIGHT ENERGY PARTNERS FUND V, L.P. 999999999 ... NaN NaN None NaN
# 159 ALLOY MERCHANT PARTNERS L.P. NaN ALLOY MERCHANT PARTNERS L.P. 999999999 ... NaN NaN None NaN
# 160 ADVENT LATIN AMERICAN PRIVATE EQUITY FUND V-F ... NaN ADVENT LATIN AMERICAN PRIVATE EQUITY FUND V-F ... 999999999 ... NaN NaN None NaN
# 161 ABRY ADVANCED SECURITIES FUND LP NaN ABRY ADVANCED SECURITIES FUND LP 999999999 ... NaN NaN None NaN
# 162 ADVENT LATIN AMERICAN PRIVATE EQUITY FUND IV-F... NaN ADVENT LATIN AMERICAN PRIVATE EQUITY FUND IV-F... 999999999 ... NaN NaN None NaN
# [163 rows x 20 columns]
url = "https://www.sec.gov/Archives/edgar/data/1279394/000114554921008162/primary_doc.xml"
# Same namespaced xpath; this filing contains a single security, so the
# result is a one-row frame.
df = pd.read_xml(url, xpath="//edgar:invstOrSec",
namespaces={"edgar": "http://www.sec.gov/edgar/nport"})
print(df)
# name lei title cusip ... invCountry isRestrictedSec fairValLevel securityLending
# 0 Salient Private Access Master Fund, L.P. NaN Salient Private Access Master Fund, L.P. 999999999 ... US Y NaN NaN
# [1 rows x 18 columns]
Fortunately, read_xml supports XSLT (special-purpose language designed to transform XML documents) with default lxml parser. With XSLT, you can then flatten needed nodes for migration to retrieve the 32 columns.
# XSLT stylesheet: identity-copy the whole document, but override the
# template for edgar:invstOrSec so its grandchildren (*|*/*) are pulled
# up one level, flattening the nested elements into direct children.
xsl = """<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:edgar="http://www.sec.gov/edgar/nport">
<xsl:output method="xml" indent="yes" />
<xsl:strip-space elements="*"/>
<xsl:template match="@*|node()">
<xsl:copy>
<xsl:apply-templates select="@*|node()"/>
</xsl:copy>
</xsl:template>
<xsl:template match="edgar:invstOrSec">
<xsl:copy>
<xsl:apply-templates select="*|*/*"/>
</xsl:copy>
</xsl:template>
</xsl:stylesheet>
"""
url = "https://www.sec.gov/Archives/edgar/data/1279392/000114554921008161/primary_doc.xml"
# read_xml (with the default lxml parser) applies the stylesheet before
# running the xpath, so the flattened columns are picked up.
df = pd.read_xml(url, xpath="//edgar:invstOrSec", namespaces={"edgar": "http://www.sec.gov/edgar/nport"},
stylesheet=xsl)
print(df)
# name lei title cusip ... annualizedRt isDefault areIntrstPmntsInArrs isPaidKind
# 0 Tastemade Inc. NaN Tastemade Inc. 999999999 ... NaN None None None
# 1 Regatta XV Funding Ltd., Subordinated Note, Pr... NaN Regatta XV Funding Ltd., Subordinated Note, Pr... 75888PAC7 ... 0.0624 N N N
# 2 Hired, Inc., Series C Preferred Stock NaN Hired, Inc., Series C Preferred Stock NaN ... NaN None None None
# 3 WESTVIEW CAPITAL PARTNERS II LP NaN WESTVIEW CAPITAL PARTNERS II LP 999999999 ... NaN None None None
# 4 VOYAGER CAPITAL FUND III, L.P. NaN VOYAGER CAPITAL FUND III, L.P. 999999999 ... NaN None None None
.. ... ... ... ... ... ... ... ... ...
# 158 ARCLIGHT ENERGY PARTNERS FUND V, L.P. NaN ARCLIGHT ENERGY PARTNERS FUND V, L.P. 999999999 ... NaN None None None
# 159 ALLOY MERCHANT PARTNERS L.P. NaN ALLOY MERCHANT PARTNERS L.P. 999999999 ... NaN None None None
# 160 ADVENT LATIN AMERICAN PRIVATE EQUITY FUND V-F ... NaN ADVENT LATIN AMERICAN PRIVATE EQUITY FUND V-F ... 999999999 ... NaN None None None
# 161 ABRY ADVANCED SECURITIES FUND LP NaN ABRY ADVANCED SECURITIES FUND LP 999999999 ... NaN None None None
# 162 ADVENT LATIN AMERICAN PRIVATE EQUITY FUND IV-F... NaN ADVENT LATIN AMERICAN PRIVATE EQUITY FUND IV-F... 999999999 ... NaN None None None
# [163 rows x 32 columns]
Pandas < 1.3
To achieve same result via XPath approach requires more steps where you will have to handle URL request and XML parsing to data frame build. Specifically, create a list of dictionaries from transformed, parsed XML and pass into DataFrame constructor. Below uses same XSLT and XPath with namespace as above.
import lxml.etree as lx
import pandas as pd
import urllib.request as rq
url = "https://www.sec.gov/Archives/edgar/data/1279392/000114554921008161/primary_doc.xml"
# Identity-plus-flatten XSLT: copy everything, but hoist each
# edgar:invstOrSec's grandchildren (*|*/*) up one level.
xsl = """<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:edgar="http://www.sec.gov/edgar/nport">
<xsl:output method="xml" indent="yes" />
<xsl:strip-space elements="*"/>
<xsl:template match="@*|node()">
<xsl:copy>
<xsl:apply-templates select="@*|node()"/>
</xsl:copy>
</xsl:template>
<xsl:template match="edgar:invstOrSec">
<xsl:copy>
<xsl:apply-templates select="*|*/*"/>
</xsl:copy>
</xsl:template>
</xsl:stylesheet>
"""
content = rq.urlopen(url)
# LOAD XML AND XSL
doc = lx.fromstring(content.read())
style = lx.fromstring(xsl)
# INITIALIZE AND TRANSFORM ORIGINAL DOC
transformer = lx.XSLT(style)
result = transformer(doc)
# RUN XPATH PARSING ON FLATTER XML
# tag.split('}')[1] strips the "{uri}" prefix lxml puts on element
# tags, leaving the bare tag name to use as the column key.
data = [{node.tag.split('}')[1]:node.text for node in inv.xpath("*")
} for inv in result.xpath("//edgar:invstOrSec",
namespaces={"edgar": "http://www.sec.gov/edgar/nport"})]
# BIND DATA FOR DATA FRAME
df = pd.DataFrame(data)
print(df)
# name lei title ... isDefault areIntrstPmntsInArrs isPaidKind
# 0 Tastemade Inc. N/A Tastemade Inc. ... NaN NaN NaN
# 1 Regatta XV Funding Ltd., Subordinated Note, Pr... N/A Regatta XV Funding Ltd., Subordinated Note, Pr... ... N N N
# 2 Hired, Inc., Series C Preferred Stock N/A Hired, Inc., Series C Preferred Stock ... NaN NaN NaN
# 3 WESTVIEW CAPITAL PARTNERS II LP N/A WESTVIEW CAPITAL PARTNERS II LP ... NaN NaN NaN
# 4 VOYAGER CAPITAL FUND III, L.P. N/A VOYAGER CAPITAL FUND III, L.P. ... NaN NaN NaN
# .. ... ... ... ... ... ... ...
# 158 ARCLIGHT ENERGY PARTNERS FUND V, L.P. N/A ARCLIGHT ENERGY PARTNERS FUND V, L.P. ... NaN NaN NaN
# 159 ALLOY MERCHANT PARTNERS L.P. N/A ALLOY MERCHANT PARTNERS L.P. ... NaN NaN NaN
# 160 ADVENT LATIN AMERICAN PRIVATE EQUITY FUND V-F ... N/A ADVENT LATIN AMERICAN PRIVATE EQUITY FUND V-F ... ... NaN NaN NaN
# 161 ABRY ADVANCED SECURITIES FUND LP N/A ABRY ADVANCED SECURITIES FUND LP ... NaN NaN NaN
# 162 ADVENT LATIN AMERICAN PRIVATE EQUITY FUND IV-F... N/A ADVENT LATIN AMERICAN PRIVATE EQUITY FUND IV-F... ... NaN NaN NaN
# [163 rows x 32 columns]
First of all, thanks for the feedback! I wrote pandas-read-xml because pandas did not have a pd.read_xml() implementation. You (and the rest of us) will be pleased to know that there is a dev version of pandas read_xml which should be coming soon! (https://pandas.pydata.org/docs/dev/reference/api/pandas.read_xml.html)
As for you current conundrum, this is a result (and one of my many dislikes towards) of the structure of XML. Unlike JSON, where single elements can be returned within a list, the XML structure just has one XML tag, which is interpreted as a single value rather than a list.
Essentially, if there is only one "row" tag, then the tags intended as columns are treated as rows instead... I'm not making much sense, am I? Let me explain with your examples.
Here is how I suggest you use it:
# Import package
import pandas_read_xml as pdx
from pandas_read_xml import fully_flatten
# Example 1
url_1 = 'https://www.sec.gov/Archives/edgar/data/1279392/000114554921008161/primary_doc.xml'
# invstOrSec repeats here, so listing it as the deepest root tag yields
# a list of records (one per security); fully_flatten expands them.
df_1 = pdx.read_xml(url_1,['edgarSubmission', 'formData','invstOrSecs', 'invstOrSec']).pipe(fully_flatten)
# Example 2
url_2 = "https://www.sec.gov/Archives/edgar/data/1279394/000114554921008162/primary_doc.xml"
# Only one security in this filing: stop one level higher and transpose
# so the single record's tags become columns instead of rows.
df_2 = pdx.read_xml(url_2,['edgarSubmission', 'formData', 'invstOrSecs'], transpose=True).pipe(fully_flatten)
df_2
What is the difference?
In Example 1, you already expect multiple within tag. So, passing the root_tag_list=['edgarSubmission', 'formData','invstOrSecs', 'invstOrSec'] returns a list under the hood. The fully_flatten process would first explode the list into rows.
In Example 2, if you use the same root_tag_list, pandas is not reading in a list. Rather, it is reading in a dictionary that corresponds to the single row. In effect, it treats the tags intended as columns to be rows. Instead, I would pass one tag above it as the root tag, then transpose it, then fully_flatten.
Yes... I know... it is a bit of a workaround. But... then again, I didn't create pandas-read-xml hoping to solve all the problems. It was always meant to be an interim solution until pandas natively supports reading XML (which it looks like it is coming soon).
Let me know how it goes!
EDIT:
Regarding how to make it so that the XML to pandas DataFrame conversion can switch depending on whether the XML has only one "row" tag or multiple, I have the following two options.
In the many row case, the DataFrame will result in a DataFrame with integer index (row numbers), whereas in the single row case, the DataFrame indices will be "Strings" that were meant to be columns. So one strategy would be to detect that and re-do accordingly. (you could probably avoid double downloading with a smarter approach)
# Import package
import pandas as pd
import pandas_read_xml as pdx
from pandas_read_xml import fully_flatten
# Example 3
# Example 3: detect single-row vs multi-row filings and re-read
# accordingly, then concatenate everything into one frame.
dfs = []
url_components = ['1279392/000114554921008161', '1279394/000114554921008162']
for url_component in url_components:
    url = f'https://www.sec.gov/Archives/edgar/data/{url_component}/primary_doc.xml'
    # Probe first: a single-"row" document comes back with string
    # indices (the would-be column names) rather than a 0-based index.
    temp = pdx.read_xml(url, ['edgarSubmission', 'formData', 'invstOrSecs'])
    if 0 not in temp.index:
        # Single row: transpose so the tags become columns.
        temp = pdx.read_xml(url, ['edgarSubmission', 'formData', 'invstOrSecs'], transpose=True)
    else:
        # Multiple rows: descend one more level to the repeating tag.
        temp = pdx.read_xml(url, ['edgarSubmission', 'formData', 'invstOrSecs', 'invstOrSec'])
    dfs.append(temp)
df = pd.concat(dfs, ignore_index=True).pipe(fully_flatten)
df
Another option is to use the underlying tools. There is no magic behind pandas_read_xml, it uses a package called xmltodict. Read the XML, convert to dicts, then convert to pandas, and then flatten. The only downside is that because the name of the tag "invstOrSec" is retained, they become prefixes for the column names. You should be able to remove those easily.
# Import package
import pandas as pd
import pandas_read_xml as pdx
import xmltodict
from pandas_read_xml import fully_flatten
# Example 4
# Example 4: skip pandas_read_xml's frame building and use xmltodict
# directly — fetch each filing, parse to nested dicts, and drill down
# to the securities node before constructing the DataFrame.
url_components = ['1279392/000114554921008161', '1279394/000114554921008162']
xmldicts = []
for url_component in url_components:
    url = f'https://www.sec.gov/Archives/edgar/data/{url_component}/primary_doc.xml'
    xml = pdx.read_xml_from_url(url)  # raw XML text
    xmldicts.append(xmltodict.parse(xml)['edgarSubmission']['formData']['invstOrSecs'])
df = pd.DataFrame.from_dict(xmldicts).pipe(fully_flatten)
df
Hope that helps!
EDIT:
So, I've updated the package (now version 0.2.0). Now the pandas_read_xml should treat the root tag as rows in the resulting pandas dataframe as default, so no need to distinguish XMLs that sometimes have single "row" and sometimes having multiple rows.
Should this be an issue in other cases, then there is a new argument root_is_rows that is True by default, but can be made False.