How to Convert an XML File to a Nice Pandas DataFrame

How to convert an XML file to a nice pandas DataFrame?

You can easily use xml (from the Python standard library) to convert XML to a pandas.DataFrame. Here's what I would do (when reading from a file, replace xml_data with the name of your file or file object):

import pandas as pd
import xml.etree.ElementTree as ET
import io

def iter_docs(author):
    author_attr = author.attrib
    for doc in author.iter('document'):
        doc_dict = author_attr.copy()
        doc_dict.update(doc.attrib)
        doc_dict['data'] = doc.text
        yield doc_dict

xml_data = io.StringIO(u'''YOUR XML STRING HERE''')

etree = ET.parse(xml_data) #create an ElementTree object
doc_df = pd.DataFrame(list(iter_docs(etree.getroot())))
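
For illustration, here is a hypothetical XML string that matches the structure iter_docs expects (an <author> root whose attributes are repeated onto every <document> row), together with the resulting call:

xml_data = io.StringIO(u'''
<author name="John Smith">
    <document id="1" year="2019">First document text</document>
    <document id="2" year="2020">Second document text</document>
</author>''')

etree = ET.parse(xml_data)
doc_df = pd.DataFrame(list(iter_docs(etree.getroot())))
print(doc_df)  # columns: name, id, year, data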

If there are multiple authors in your original document or the root of your XML is not an author, then I would add the following generator:

def iter_author(etree):
    for author in etree.iter('author'):
        for row in iter_docs(author):
            yield row

and change doc_df = pd.DataFrame(list(iter_docs(etree.getroot()))) to doc_df = pd.DataFrame(list(iter_author(etree)))

Have a look at the ElementTree tutorial provided in the xml library documentation.
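
As a side note (not part of the original answer): pandas 1.3 and later also ship a built-in pd.read_xml that handles flat documents like this in one call, although it will not merge the parent <author> attributes onto each <document> row the way iter_docs does. A minimal sketch, assuming a hypothetical authors.xml file:

import pandas as pd

# Requires pandas >= 1.3. "authors.xml" is a hypothetical file name;
# the xpath selects every <document> element, whose attributes (and text)
# become DataFrame columns.
doc_df = pd.read_xml("authors.xml", xpath=".//document")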

How to convert XML data to a pandas DataFrame?

My approach is to avoid manual XML parsing and switch straight into pandas by using xmlplain to turn the XML into plain JSON-like objects.

import pandas as pd
import xmlplain

# read the XML into plain Python objects (lists/dicts)
with open("so_sample.xml") as f:
    js = xmlplain.xml_to_obj(f, strip_space=True, fold_dict=True)

df1 = pd.json_normalize(js).explode("PublicationDelivery.dataObjects.GeneralFrame.members")

# clean up column names
df1 = df1.rename(columns={c: c.replace("PublicationDelivery.", "")
                              .replace("dataObjects.GeneralFrame.", "").strip()
                          for c in df1.columns})
# drop spurious attribute columns
df1 = df1.drop(columns=[c for c in df1.columns if c[0] == "@"])
# expand second level of dictionaries
df1 = pd.json_normalize(df1.to_dict(orient="records"))
# clean up columns from the second set of dictionaries
df1 = df1.rename(columns={c: c.replace("members.Quay.", "") for c in df1.columns})
# expand the next list and dicts
df1 = pd.json_normalize(df1.explode("Centroid.Location.gml:pos").to_dict(orient="records"))
# there are some NaNs - deal with them
df1["Centroid.Location.gml:pos.@srsName"] = df1["Centroid.Location.gml:pos.@srsName"].ffill()
df1["Centroid.Location.gml:pos"] = df1["Centroid.Location.gml:pos"].bfill()
# de-duplicate
df1 = df1.groupby("@id", as_index=False).first()

# more columns than requested... for SO output
print(df1.loc[:,["Name", "Centroid.Location.gml:pos.@srsName", "Centroid.Location.gml:pos"]].to_string(index=False))

Output:

                         Name Centroid.Location.gml:pos.@srsName            Centroid.Location.gml:pos
         ST FELICIEN - Darone                          EPSG:2154  829036.2709757038 6444724.878001894
         ST FELICIEN - Centre                          EPSG:2154  828054.2068251468 6444393.512041969
ST FELICIEN - Col de Fontayes                          EPSG:2154   829504.7993360173 6445490.57188837
 ST FELICIEN - Chemin de Juny                          EPSG:2154  828747.3172982805 6445226.100290826

Read XML file to Pandas DataFrame

If the data is simple, like this, then you can do something like:

import pandas as pd
from lxml import objectify

xml = objectify.parse('Document1.xml')
root = xml.getroot()

bathrooms = [child.text for child in root['bathrooms'].getchildren()]
price = [child.text for child in root['price'].getchildren()]
property_id = [child.text for child in root['property_id'].getchildren()]

data = [bathrooms, price, property_id]
df = pd.DataFrame(data).T
df.columns = ['bathrooms', 'price', 'property_id']

   bathrooms       price property_id
0        1.0   7020000.0     35237.0
1        3.0  10000000.0     32238.0
2        nan   4128000.0     44699.0
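
For reference, a self-contained sketch using a hypothetical Document1.xml layout inferred from the output above (one parent element per column, one child per row); the tag names here are placeholders, not the asker's real file:

import pandas as pd
from lxml import objectify

xml_string = b"""<root>
  <bathrooms><value>1.0</value><value>3.0</value><value>nan</value></bathrooms>
  <price><value>7020000.0</value><value>10000000.0</value><value>4128000.0</value></price>
  <property_id><value>35237.0</value><value>32238.0</value><value>44699.0</value></property_id>
</root>"""

root = objectify.fromstring(xml_string)
df = pd.DataFrame({
    'bathrooms': [child.text for child in root['bathrooms'].iterchildren()],
    'price': [child.text for child in root['price'].iterchildren()],
    'property_id': [child.text for child in root['property_id'].iterchildren()],
})
print(df)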

If it is more complex, then a loop is better. You can do something like:

from lxml import objectify
xml = objectify.parse('Document1.xml')
root = xml.getroot()

data = []
for i in range(len(root.getchildren())):
    data.append([child.text for child in root.getchildren()[i].getchildren()])

df = pd.DataFrame(data).T
df.columns = ['bathrooms', 'price', 'property_id']

Parse XML to pandas DataFrame in Python

The problem in your solution was that the "element data extraction" was not done properly. The XML you mentioned in the question is nested several layers deep, which is why we need to read and extract the data recursively. The following solution should give you what you need in this case, although I would encourage you to look at this article and the Python documentation for more clarity.

Method: 1

import numpy as np
import pandas as pd
#import os
import xml.etree.ElementTree as ET

def xml2df(xml_source, df_cols, source_is_file=False, show_progress=True):
    """Parse the input XML source and store the result in a pandas
    DataFrame with the given columns.

    For xml_source = xml_file, set source_is_file = True.
    For xml_source = xml_string, set source_is_file = False.

    <element attribute_key1=attribute_value1, attribute_key2=attribute_value2>
        <child1>Child 1 Text</child1>
        <child2>Child 2 Text</child2>
        <child3>Child 3 Text</child3>
    </element>

    Note that for an XML structure as shown above, the children of an element
    can be accessed with list(element), any text associated with the <element>
    tag can be accessed as element.text, and the name of the tag itself can be
    accessed with element.tag.
    """
    if source_is_file:
        xtree = ET.parse(xml_source)  # xml_source = xml_file
        xroot = xtree.getroot()
    else:
        xroot = ET.fromstring(xml_source)  # xml_source = xml_string

    consolidator_dict = dict()
    default_instance_dict = {label: None for label in df_cols}

    def get_children_info(children, instance_dict):
        # We avoid element.getchildren() as it is deprecated;
        # list(element) gives the child elements instead.
        for child in children:
            if len(list(child)) > 0:
                # the child has children of its own: recurse into them
                instance_dict = get_children_info(list(child), instance_dict)
            if len(list(child.keys())) > 0:
                # copy the child's attributes into the row
                instance_dict.update({key: value for (key, value) in child.items()})
            instance_dict.update({child.tag: child.text})
        return instance_dict

    # Loop over all instances
    for instance in list(xroot):
        instance_dict = default_instance_dict.copy()
        ikey, ivalue = list(instance.items())[0]  # the first attribute is "ID"
        instance_dict.update({ikey: ivalue})
        if show_progress:
            print('{}: {}={}'.format(instance.tag, ikey, ivalue))
        # Loop inside every instance
        instance_dict = get_children_info(list(instance), instance_dict)
        consolidator_dict[ivalue] = instance_dict.copy()

    df = pd.DataFrame(consolidator_dict).T
    df = df[df_cols]
    return df

Run the following to generate the desired output.

xml_source = r'grade_data.xml'
df_cols = ["ID", "TaskID", "DataSource", "ProblemDescription", "Question", "Answer",
"ContextRequired", "ExtraInfoInAnswer", "Comments", "Watch", 'ReferenceAnswers']

df = xml2df(xml_source, df_cols, source_is_file = True)
df

Method: 2

Given that you have the xml_string, you can convert XML >> dict >> DataFrame. Run the following to get the desired output.

Note: You will need to install xmltodict to use Method-2. This method is inspired by the solution suggested by @martin-blech at How to convert XML to JSON in Python? Kudos to @martin-blech for making it.

pip install -U xmltodict
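
As a quick orientation (my own sketch, not part of the answer): xmltodict maps XML attributes to keys prefixed with "@" and element text to a "#text" key, which is why read_recursively() below strips the "@" prefix and looks for "#text".

import xmltodict

# Hypothetical one-element example of xmltodict's key conventions.
sample = '<Instance ID="42"><Question lang="en">Why?</Question></Instance>'
print(xmltodict.parse(sample))
# roughly: {'Instance': {'@ID': '42', 'Question': {'@lang': 'en', '#text': 'Why?'}}}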

Solution

def read_recursively(x, instance_dict):
    """Recursively flatten the dict produced by xmltodict into instance_dict."""
    txt = ''
    for key in x.keys():
        k = key.replace("@", "")
        if k in df_cols:
            if isinstance(x.get(key), dict):
                instance_dict, txt = read_recursively(x.get(key), instance_dict)
            instance_dict.update({k: x.get(key)})
        else:
            # dig deeper if the value is another dict
            if isinstance(x.get(key), dict):
                instance_dict, txt = read_recursively(x.get(key), instance_dict)
            # remember simple text associated with the element
            if k == '#text':
                txt = x.get(key)
            # map the collected text back onto the corresponding parent element
            if (k != '#text') and (txt != ''):
                instance_dict.update({k: txt})
    return (instance_dict, txt)

You will need the function read_recursively() given above. Now run the following.

import xmltodict, json

o = xmltodict.parse(xml_string) # INPUT: XML_STRING
#print(json.dumps(o)) # uncomment to see xml to json converted string

consolidated_dict = dict()
oi = o['Instances']['Instance']

for x in oi:
    instance_dict = dict()
    instance_dict, _ = read_recursively(x, instance_dict)
    consolidated_dict.update({x.get("@ID"): instance_dict.copy()})
df = pd.DataFrame(consolidated_dict).T
df = df[df_cols]
df

Python: Extracting XML to DataFrame (Pandas)

As advised in this solution by gold-badge Python/pandas/numpy guru @unutbu:

Never call DataFrame.append or pd.concat inside a for-loop. It leads to quadratic copying.

Therefore, consider parsing your XML data into a separate list, then pass that list to the DataFrame constructor in one call outside of any loop. In fact, you can pass a nested list built with a list comprehension directly into the constructor:

import xml.etree.ElementTree as et
import pandas as pd

path = 'AttributesXMLPandas.xml'
dfcols = ['ID', 'Text', 'CreationDate']

root = et.parse(path)
rows = root.findall('.//row')

# NESTED LIST
xml_data = [[row.get('Id'), row.get('Text'), row.get('CreationDate')]
for row in rows]

df_xml = pd.DataFrame(xml_data, columns=dfcols)

print(df_xml)

# ID Text CreationDate
# 0 1 (...) 2011-08-30T21:15:28.063
# 1 2 (...) 2011-08-30T21:24:56.573
# 2 3 (...) None
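
For contrast, here is a hypothetical sketch of the pattern the quote warns against: growing the DataFrame inside the loop, which re-copies every previously accumulated row on each iteration.

# Anti-pattern (illustration only): n appends copy O(n^2) rows in total.
df_xml = pd.DataFrame(columns=dfcols)
for row in rows:
    record = pd.DataFrame([[row.get('Id'), row.get('Text'), row.get('CreationDate')]],
                          columns=dfcols)
    df_xml = pd.concat([df_xml, record], ignore_index=True)  # avoid this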

How to convert an XML file to pandas dataframe?

Try using:

from bs4 import BeautifulSoup
import pandas as pd

name_list = []

prev_df = pd.DataFrame(columns=['time'])
# get_xml() stands for whatever returns your raw XML string
response = BeautifulSoup(get_xml(), 'lxml')
for x in response.find_all('edpobjectbe'):
    list_small = list()

    name = str(x.find('edpobjectname').text).strip()
    name_list.append(name)

    data = x.find_all('edpenergydatabe')
    print(name)
    for y in data:
        applicableat = str(y.find('applicableat').text).strip()
        flowrate = str(y.find('flowrate').text).strip()
        list_small.append([applicableat, flowrate])

    df = pd.DataFrame(list_small, columns=['time', name])
    prev_df = prev_df.merge(df, how='right', on='time')

print(prev_df)

Check if this works for you!


