Using Python Iterparse for Large XML Files

Using Python Iterparse For Large XML Files

Try Liza Daly's fast_iter. After processing an element, elem, it calls elem.clear() to remove descendants and also removes preceding siblings.

def fast_iter(context, func, *args, **kwargs):
    """Stream over an lxml iterparse context, freeing memory as it goes.

    Applies *func* (with any extra ``*args``/``**kwargs``) to every element
    yielded by *context*, then clears the element and deletes all of its
    already-processed preceding siblings at every ancestor level.

    http://lxml.de/parsing.html#modifying-the-tree
    Based on Liza Daly's fast_iter
    http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
    See also http://effbot.org/zone/element-iterparse.htm
    """
    for _event, node in context:
        func(node, *args, **kwargs)
        # Safe to clear here: no descendant of node is accessed again.
        node.clear()
        # Drop the now-empty references that the root chain still holds
        # to nodes preceding this one.
        for level in node.xpath('ancestor-or-self::*'):
            while level.getprevious() is not None:
                del level.getparent()[0]
    del context

def process_element(elem):
    """Print the text nodes of each <description> child of *elem* (lxml XPath)."""
    # Fixed for Python 3: the original used the Python 2 print statement,
    # which is a syntax error on Python 3.
    print(elem.xpath( 'description/text( )' ))

# Build an iterparse context that yields only <item> elements, then stream
# it through fast_iter so memory stays bounded during parsing.
# NOTE(review): assumes `etree` (presumably lxml.etree) and MYFILE are
# defined earlier in the surrounding script — confirm before running.
context = etree.iterparse( MYFILE, tag='item' )
fast_iter(context,process_element)

Daly's article is an excellent read, especially if you are processing large XML files.


Edit: The fast_iter posted above is a modified version of Daly's fast_iter. After processing an element, it is more aggressive at removing other elements that are no longer needed.

The script below shows the difference in behavior. Note in particular that orig_fast_iter does not delete the A1 element, while the mod_fast_iter does delete it, thus saving more memory.

import lxml.etree as ET
import textwrap
import io

def setup_ABC():
    """Return the small two-sibling XML document used by the iterparse demos.

    The literal is uniformly indented for readability; textwrap.dedent
    strips that common prefix so the returned text starts at column 0.
    """
    document = textwrap.dedent('''\
    <root>
    <A1>
    <B1></B1>
    <C>1<D1></D1></C>
    <E1></E1>
    </A1>
    <A2>
    <B2></B2>
    <C>2<D></D></C>
    <E2></E2>
    </A2>
    </root>
    ''')
    return document

def study_fast_iter():
    """Contrast Daly's original fast_iter with the more aggressive variant.

    The original deletes only siblings preceding the processed element
    itself; the modified version also walks ancestor-or-self and deletes
    preceding siblings at every level, so a finished subtree such as <A1>
    is freed as soon as processing moves into <A2>.
    """
    def orig_fast_iter(context, func, *args, **kwargs):
        # Daly's original: clear the element, then drop only the processed
        # preceding siblings of the element itself.
        for event, elem in context:
            print('Processing {e}'.format(e=ET.tostring(elem)))
            func(elem, *args, **kwargs)
            print('Clearing {e}'.format(e=ET.tostring(elem)))
            elem.clear()
            while elem.getprevious() is not None:
                print('Deleting {p}'.format(
                    p=(elem.getparent()[0]).tag))
                del elem.getparent()[0]
        del context

    def mod_fast_iter(context, func, *args, **kwargs):
        """
        http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
        Author: Liza Daly
        See also http://effbot.org/zone/element-iterparse.htm
        """
        for event, elem in context:
            print('Processing {e}'.format(e=ET.tostring(elem)))
            func(elem, *args, **kwargs)
            # It's safe to call clear() here because no descendants will be
            # accessed
            print('Clearing {e}'.format(e=ET.tostring(elem)))
            elem.clear()
            # Also eliminate now-empty references from the root node to elem
            for ancestor in elem.xpath('ancestor-or-self::*'):
                print('Checking ancestor: {a}'.format(a=ancestor.tag))
                while ancestor.getprevious() is not None:
                    print(
                        'Deleting {p}'.format(p=(ancestor.getparent()[0]).tag))
                    del ancestor.getparent()[0]
        del context

    # BUG FIX: iterparse requires a byte stream; io.BytesIO(str) raises
    # TypeError on Python 3, so encode the document before wrapping it.
    content = setup_ABC().encode('utf-8')
    context = ET.iterparse(io.BytesIO(content), events=('end', ), tag='C')
    orig_fast_iter(context, lambda elem: None)
    # Expected trace:
    # Processing <C>1<D1/></C>
    # Clearing <C>1<D1/></C>
    # Deleting B1
    # Processing <C>2<D/></C>
    # Clearing <C>2<D/></C>
    # Deleting B2

    print('-' * 80)
    # The improved fast_iter deletes A1. The original fast_iter does not.
    content = setup_ABC().encode('utf-8')
    context = ET.iterparse(io.BytesIO(content), events=('end', ), tag='C')
    mod_fast_iter(context, lambda elem: None)
    # Expected trace:
    # Processing <C>1<D1/></C>
    # Clearing <C>1<D1/></C>
    # Checking ancestor: root
    # Checking ancestor: A1
    # Checking ancestor: C
    # Deleting B1
    # Processing <C>2<D/></C>
    # Clearing <C>2<D/></C>
    # Checking ancestor: root
    # Checking ancestor: A2
    # Deleting A1
    # Checking ancestor: C
    # Deleting B2

# Run the comparison demo when the script is executed.
study_fast_iter()

How to iteratively parse a large XML file in Python?

Iterating over a huge XML file is always painful.

I'll go over all the process from start to finish, suggesting the best practices for keeping low memory yet maximizing parsing speed.

First no need to store ET.iterparse as a variable. Just iterate over it like

for event, elem in ET.iterparse(xml_file, events=("start", "end")):
This iterator is created for, well..., iteration, storing nothing in memory except the current element. Also, you don't need root.clear() with this approach, and you can process XML files as huge as your hard disk space allows.

Your code should look like:

from xml.etree import cElementTree as ET

def get_all_records(xml_file_path, record_category, name_types, name_components):
    """Collect the parsed contents of every matching record in the file.

    Streams the document with iterparse; on each closing tag whose name
    equals *record_category* and whose action attribute is not 'del',
    delegates parsing to get_record and accumulates its results.
    """
    collected = []
    for evt, node in ET.iterparse(xml_file_path, events=("start", "end")):
        # Guard clauses mirror the original short-circuit condition order.
        if evt != 'end':
            continue
        if node.tag != record_category:
            continue
        if node.attrib['action'] == 'del':
            continue
        parsed = get_record(node, name_types=name_types,
                            name_components=name_components,
                            record_id=node.attrib['id'])
        if parsed:
            collected += parsed
    return collected

Also, please think carefully about the reason you need to store the whole list of all_records. If it's only for writing CSV file at the end of the process - this reason isn't good enough and can cause memory issues when scaling to even bigger XML files.

Make sure you write each new row to the CSV file as it is produced, turning memory issues into a non-issue.

P.S.

If you need to store several tags before you find your main tag, in order to parse this historic information as you go down the XML file — just store it locally in some new variables. This comes in handy whenever later data in the XML file requires you to refer back to a specific tag you know has already occurred.

iterparse large XML using python

Here is a working program that illustrates how to use .iterparse() from cElementTree, storing the results in a database. Note that this program is aware of the namespace used in the input XML.

The i.xml is identical to the example XML given in the question.

# Originally tested on Python 2.6.7, Ubuntu 14.04.4; updated for Python 3:
# cElementTree was removed in Python 3.9 and print is now a function.
import xml.etree.ElementTree as et
import sqlite3

# Tools to deal with namespaces
ixid_uri = 'http://www.website.com/ixid/xmlfile/v8'

def extract_local_tag(qname):
    """Strip the '{namespace}' prefix from a fully qualified tag name."""
    return qname.split('}')[-1]

# A db connection to illustrate the example.
# NOTE(review): column name 'joury_uid' looks like a typo for 'journey_uid';
# kept as-is because the inserts below are positional and don't name it.
conn = sqlite3.connect(":memory:")
c = conn.cursor()
c.execute("create table foo (joury_uid text, tag text, tpl text)")
conn.commit()

# The main part of the code: iterate over the XML,
# storing DB rows at the end of every <Journey>
with open('i.xml') as xml_file:
    for event, elem in et.iterparse(xml_file):
        # Must compare tag to the qualified name (tag includes '{uri}')
        if elem.tag == et.QName(ixid_uri, 'Journey'):
            c.executemany('insert into foo values(?, ?, ?)',
                          [
                              (elem.attrib['uid'],
                               extract_local_tag(child.tag),
                               child.attrib.get('tpl', None))
                              for child in elem
                          ])
            conn.commit()
            # Note: only clears <Journey> elements and their children.
            # There is a memory leak of any elements not children of <Journey>
            elem.clear()

for row in c.execute('select * from foo'):
    print(row)

Result:

(u'G61365', u'OR', u'PERTH')
(u'G61365', u'PP', u'HILTONJ')
...
(u'G61365', u'DT', u'GLGC')
(u'G64015', u'OR', u'GLGQLL')
(u'G64015', u'PP', u'FNSTNEJ')
...

References:

  • http://effbot.org/zone/element-namespaces.htm
  • http://effbot.org/zone/pythondoc-elementtree-ElementTree.htm#elementtree.ElementTree.QName-class
  • http://effbot.org/zone/element-iterparse.htm

Large XML File Parsing in Python

Consider iterparse for fast streaming processing that builds tree incrementally. In each iteration build a list of dictionaries that you can then pass into pandas.DataFrame constructor once outside loop. Adjust below to name of repeating nodes of root's children:

from xml.etree.ElementTree import iterparse
#from cElementTree import iterparse
import pandas as pd

file_path = r"/path/to/Input.xml"
dict_list = []

# Stream the document; at each closing <row>, copy its attributes into a
# plain dict so the element can be released immediately afterwards.
for _, node in iterparse(file_path, events=("end",)):
    if node.tag == "row":
        attrs = node.attrib
        dict_list.append({'rowId': attrs['Id'],
                          'UserId': attrs['UserId'],
                          'Name': attrs['Name'],
                          'Date': attrs['Date'],
                          'Class': attrs['Class'],
                          'TagBased': attrs['TagBased']})

        # dict_list.append(node.attrib) # ALTERNATIVELY, PARSE ALL ATTRIBUTES

    node.clear()  # free the element once its attributes have been copied

# Build the frame once, outside the loop.
df = pd.DataFrame(dict_list)

Parsing large XML file with lxml

You can use etree.iterparse to avoid loading the whole file in memory:

events = ("start", "end")
with open("dblp.xml", "r") as fo:
    # Stream start/end events instead of loading the whole file in memory.
    context = etree.iterparse(fo, events=events)
    for action, elem in context:
        # Do something with each (action, elem) pair here.
        pass

This will allow you to only extract entities you need while ignoring others.

Python lxml iterparse sort by attribute large xml file

import lxml.etree as ET
from copy import deepcopy

xml_source = 'ss_sky_sw_xmltv.xml'
xml_output = 'ss_sky_sw_xmltv_parsed.xml'
# icons with these dimensions (width, height) will be removed:
remove_dimensions = (
    (180, 135),
    (120, 180),
)

tree = ET.parse(xml_source)
root = tree.getroot()
for programme in root.iterfind('programme'):
    # Deep-copy every icon, ordered by ascending height, so they can be
    # re-inserted in sorted order once the originals are stripped out.
    sorted_icons = deepcopy(sorted(programme.findall('icon'),
                                   key=lambda icon: int(icon.attrib['height'])))

    # Remove all existing icons from this programme.
    for stale_icon in programme.findall('icon'):
        programme.remove(stale_icon)

    # Re-append the sorted icons, skipping the blacklisted sizes.
    for candidate in sorted_icons:
        size = int(candidate.attrib['width']), int(candidate.attrib['height'])
        if size not in remove_dimensions:
            programme.append(candidate)

# Save the file
tree.write(xml_output, xml_declaration=True, pretty_print=True)


Related Topics



Leave a reply



Submit