Convert an Xml File to CSV File Using Java

Convert an XML file to CSV file using java

here's a working example, data.xml has your data:

import java.io.File;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

import org.w3c.dom.Document;

class Xml2Csv {

public static void main(String args[]) throws Exception {
File stylesheet = new File("src/main/resources/style.xsl");
File xmlSource = new File("src/main/resources/data.xml");

DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
DocumentBuilder builder = factory.newDocumentBuilder();
Document document = builder.parse(xmlSource);

StreamSource stylesource = new StreamSource(stylesheet);
Transformer transformer = TransformerFactory.newInstance()
.newTransformer(stylesource);
Source source = new DOMSource(document);
Result outputTarget = new StreamResult(new File("/tmp/x.csv"));
transformer.transform(source, outputTarget);
}
}

style.xsl

<?xml version="1.0"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:fo="http://www.w3.org/1999/XSL/Format" >
<xsl:output method="text" omit-xml-declaration="yes" indent="no"/>
<xsl:template match="/">
Host_Name,IP_address,OS,Load_avg_1min,Load_avg_5min,Load_avg_15min
<xsl:for-each select="//Host">
<xsl:value-of select="concat(Host_Name,',',IP_address,',',OS,Load_avg_1min,',',Load_avg_5min,',',Load_avg_15min,' ')"/>
</xsl:for-each>
</xsl:template>
</xsl:stylesheet>

output:

Host_Name,IP_address,OS,Load_avg_1min,Load_avg_5min,Load_avg_15min
srv001001,10.1.2.3,Windows1.3,2.5,1.2
srv001002,10.1.2.4,Linux1.4,2.5,1.2
srv001003,10.1.2.5,Linux3.3,1.6,1.8
srv001004,10.1.2.6,Linux2.3,4.5,4.2

Convert XML file to CSV in java

The code provided should be considered a sketch rather than the definitive article. I am not an expert on SAX and the implementation could be improved for better performance, simpler code etc. That said SAX should be able to cope with streaming large XML files.

I would approach this problem with 2 passes using the SAX parser. (Incidentally, I would also use a CSV generating library to create the output as this would deal with all the fiddly character escaping that CSV involves but I haven't implemented this in my sketch).

First pass:
Establish number of header columns

Second pass:
Output CSV

I assume that the XML file is well formed. I assume that we don't have a scheme/DTD with a predefined order.

In the first pass I have assumed that a CSV column will be added for every XML element containing text content or for any attribute (I have assumed attributes will contain something!).

The second pass, having established the number of target columns, will do the actual CSV output.

Based on your example XML my code sketch would produce:

ItemID,StartTime,EndTime,ViewItemURL,AverageTime,category,category,type,type,AveragePrice
4504216603,10:00:10.000Z,10:00:30.000Z,http://url,,,,,,
4504216604,10:30:10.000Z,11:00:10.000Z,http://url,value1,9823,9112,TX,TY,value2

Please note I have used the google collections LinkedHashMultimap as this is helpful when associating multiple values with a single key. I hope you find this useful!

import com.google.common.collect.LinkedHashMultimap;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map.Entry;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

public class App {

public static void main(String[] args) throws SAXException, FileNotFoundException, IOException {
// First pass - to determine headers
XMLReader xr = XMLReaderFactory.createXMLReader();
HeaderHandler handler = new HeaderHandler();
xr.setContentHandler(handler);
xr.setErrorHandler(handler);
FileReader r = new FileReader("test1.xml");
xr.parse(new InputSource(r));

LinkedHashMap<String, Integer> headers = handler.getHeaders();
int totalnumberofcolumns = 0;
for (int headercount : headers.values()) {
totalnumberofcolumns += headercount;
}
String[] columnheaders = new String[totalnumberofcolumns];
int i = 0;
for (Entry<String, Integer> entry : headers.entrySet()) {
for (int j = 0; j < entry.getValue(); j++) {
columnheaders[i] = entry.getKey();
i++;
}
}
StringBuilder sb = new StringBuilder();
for (String h : columnheaders) {
sb.append(h);
sb.append(',');
}
System.out.println(sb.substring(0, sb.length() - 1));

// Second pass - collect and output data

xr = XMLReaderFactory.createXMLReader();

DataHandler datahandler = new DataHandler();
datahandler.setHeaderArray(columnheaders);

xr.setContentHandler(datahandler);
xr.setErrorHandler(datahandler);
r = new FileReader("test1.xml");
xr.parse(new InputSource(r));
}

public static class HeaderHandler extends DefaultHandler {

private String content;
private String currentElement;
private boolean insideElement = false;
private Attributes attribs;
private LinkedHashMap<String, Integer> itemHeader;
private LinkedHashMap<String, Integer> accumulativeHeader = new LinkedHashMap<String, Integer>();

public HeaderHandler() {
super();
}

private LinkedHashMap<String, Integer> getHeaders() {
return accumulativeHeader;
}

private void addItemHeader(String headerName) {
if (itemHeader.containsKey(headerName)) {
itemHeader.put(headerName, itemHeader.get(headerName) + 1);
} else {
itemHeader.put(headerName, 1);
}
}

@Override
public void startElement(String uri, String name,
String qName, Attributes atts) {
if ("item".equalsIgnoreCase(qName)) {
itemHeader = new LinkedHashMap<String, Integer>();
}
currentElement = qName;
content = null;
insideElement = true;
attribs = atts;
}

@Override
public void endElement(String uri, String name, String qName) {
if (!"item".equalsIgnoreCase(qName) && !"root".equalsIgnoreCase(qName)) {
if (content != null && qName.equals(currentElement) && content.trim().length() > 0) {
addItemHeader(qName);
}
if (attribs != null) {
int attsLength = attribs.getLength();
if (attsLength > 0) {
for (int i = 0; i < attsLength; i++) {
String attName = attribs.getLocalName(i);
addItemHeader(attName);
}
}
}
}
if ("item".equalsIgnoreCase(qName)) {
for (Entry<String, Integer> entry : itemHeader.entrySet()) {
String headerName = entry.getKey();
Integer count = entry.getValue();
//System.out.println(entry.getKey() + ":" + entry.getValue());
if (accumulativeHeader.containsKey(headerName)) {
if (count > accumulativeHeader.get(headerName)) {
accumulativeHeader.put(headerName, count);
}
} else {
accumulativeHeader.put(headerName, count);
}
}
}
insideElement = false;
currentElement = null;
attribs = null;
}

@Override
public void characters(char ch[], int start, int length) {
if (insideElement) {
content = new String(ch, start, length);
}
}
}

public static class DataHandler extends DefaultHandler {

private String content;
private String currentElement;
private boolean insideElement = false;
private Attributes attribs;
private LinkedHashMultimap dataMap;
private String[] headerArray;

public DataHandler() {
super();
}

@Override
public void startElement(String uri, String name,
String qName, Attributes atts) {
if ("item".equalsIgnoreCase(qName)) {
dataMap = LinkedHashMultimap.create();
}
currentElement = qName;
content = null;
insideElement = true;
attribs = atts;
}

@Override
public void endElement(String uri, String name, String qName) {
if (!"item".equalsIgnoreCase(qName) && !"root".equalsIgnoreCase(qName)) {
if (content != null && qName.equals(currentElement) && content.trim().length() > 0) {
dataMap.put(qName, content);
}
if (attribs != null) {
int attsLength = attribs.getLength();
if (attsLength > 0) {
for (int i = 0; i < attsLength; i++) {
String attName = attribs.getLocalName(i);
dataMap.put(attName, attribs.getValue(i));
}
}
}
}
if ("item".equalsIgnoreCase(qName)) {
String data[] = new String[headerArray.length];
int i = 0;
for (String h : headerArray) {
if (dataMap.containsKey(h)) {
Object[] values = dataMap.get(h).toArray();
data[i] = (String) values[0];
if (values.length > 1) {
dataMap.removeAll(h);
for (int j = 1; j < values.length; j++) {
dataMap.put(h, values[j]);
}
} else {
dataMap.removeAll(h);
}
} else {
data[i] = "";
}
i++;
}
StringBuilder sb = new StringBuilder();
for (String d : data) {
sb.append(d);
sb.append(',');
}
System.out.println(sb.substring(0, sb.length() - 1));
}
insideElement = false;
currentElement = null;
attribs = null;
}

@Override
public void characters(char ch[], int start, int length) {
if (insideElement) {
content = new String(ch, start, length);
}
}

public void setHeaderArray(String[] headerArray) {
this.headerArray = headerArray;
}
}
}

Convert XML having multiple for-each case to CSV

I guess the inner <xsl:for-each select="/TURFS/TURF"> should be <xsl:for-each select="TURFS/TURF">. And concat(AREANAME,',',DEAFAULT should be concat(AREANAME,',',DEFAULT.

XML To CSV Conversion Java

I suggest you use this as your starting point:

XSLT 1.0

<xsl:stylesheet version="1.0" 
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="text" encoding="UTF-8"/>

<xsl:template match="/school">
<!-- header -->
<xsl:text>classroom id,classroom_name,teacher_1_id,teacher_1_last_name,teacher_1_first_name,teacher_2_id,teacher_2_last_name,teacher_2_first_name,student_id,student_last_name,student_first_name,grade </xsl:text>
<!-- data -->
<xsl:for-each select="grade/classroom">
<!-- classroom data -->
<xsl:variable name="classroom-data">
<xsl:value-of select="@id" />
<xsl:text>,</xsl:text>
<xsl:value-of select="@name" />
<xsl:text>,</xsl:text>
<xsl:value-of select="teacher[1]/@id" />
<xsl:text>,</xsl:text>
<xsl:value-of select="teacher[1]/@last_name" />
<xsl:text>,</xsl:text>
<xsl:value-of select="teacher[1]/@first_name" />
<xsl:text>,</xsl:text>
<xsl:value-of select="teacher[2]/@id" />
<xsl:text>,</xsl:text>
<xsl:value-of select="teacher[2]/@last_name" />
<xsl:text>,</xsl:text>
<xsl:value-of select="teacher[2]/@first_name" />
<xsl:text>,</xsl:text>
</xsl:variable>
<xsl:variable name="grade-id" select="../@id" />
<xsl:for-each select="student">
<xsl:copy-of select="$classroom-data"/>
<!-- student data -->
<xsl:value-of select="@id" />
<xsl:text>,</xsl:text>
<xsl:value-of select="@last_name" />
<xsl:text>,</xsl:text>
<xsl:value-of select="@first_name" />
<xsl:text>,</xsl:text>
<xsl:value-of select="$grade-id" />
<xsl:text> </xsl:text>
</xsl:for-each>
</xsl:for-each>
</xsl:template>

</xsl:stylesheet>

Applied to your input, the result will be:

classroom id,classroom_name,teacher_1_id,teacher_1_last_name,teacher_1_first_name,teacher_2_id,teacher_2_last_name,teacher_2_first_name,student_id,student_last_name,student_first_name,grade
101,Mrs. Jones' Math Class,10100000001,Jones,Barbara,,,,10100000010,Gil,Michael,1
101,Mrs. Jones' Math Class,10100000001,Jones,Barbara,,,,10100000011,Gutierrez,Kimberly,1
101,Mrs. Jones' Math Class,10100000001,Jones,Barbara,,,,10100000013,Mercado,Toby,1
101,Mrs. Jones' Math Class,10100000001,Jones,Barbara,,,,10100000014,Garcia,Lizzie,1
101,Mrs. Jones' Math Class,10100000001,Jones,Barbara,,,,10100000015,Cruz,Alex,1
102,Mr. Smith's PhysEd Class,10200000001,Smith,Arthur,10200000011,Patterson,John,10200000010,Smith,Nathaniel,1
102,Mr. Smith's PhysEd Class,10200000001,Smith,Arthur,10200000011,Patterson,John,10200000011,McCrancy,Brandon,1
102,Mr. Smith's PhysEd Class,10200000001,Smith,Arthur,10200000011,Patterson,John,10200000012,Marco,Elizabeth,1
102,Mr. Smith's PhysEd Class,10200000001,Smith,Arthur,10200000011,Patterson,John,10200000013,Lanni,Erica,1
102,Mr. Smith's PhysEd Class,10200000001,Smith,Arthur,10200000011,Patterson,John,10200000014,Flores,Michael,1
102,Mr. Smith's PhysEd Class,10200000001,Smith,Arthur,10200000011,Patterson,John,10200000015,Hill,Jasmin,1
102,Mr. Smith's PhysEd Class,10200000001,Smith,Arthur,10200000011,Patterson,John,10200000016,Perez,Brittany,1
102,Mr. Smith's PhysEd Class,10200000001,Smith,Arthur,10200000011,Patterson,John,10200000017,Hiram,William,1
102,Mr. Smith's PhysEd Class,10200000001,Smith,Arthur,10200000011,Patterson,John,10200000018,Reginald,Alexis,1
102,Mr. Smith's PhysEd Class,10200000001,Smith,Arthur,10200000011,Patterson,John,10200000019,Gayle,Matthew,1

Note that this assumes your input fields contain no commas or double quotes.

XML to CSV conversion as fast as possible for medium sized files

You can try to use Java StAX API for this propose.

For example:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;

import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;

public class XmlToCSV {

public static void convert(InputStream xml, OutputStream csv) throws Exception {
try (StringWriter header = new StringWriter(4096); StringWriter content = new StringWriter(4096)) {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader xmlEventReader = factory.createXMLEventReader(xml);
XMLEvent xmlEvent;
long nestingLevel = -1;
StringBuilder line = null;
while (xmlEventReader.hasNext()) {
xmlEvent = xmlEventReader.nextEvent();
switch (xmlEvent.getEventType()) {
case XMLEvent.START_ELEMENT:
++nestingLevel;
if (0 == nestingLevel) {
break;
} else if (1 == nestingLevel) {
line = new StringBuilder();
}
StartElement startElement = xmlEvent.asStartElement();
serializeElementHeader(header, line, startElement);
break;
case XMLEvent.CHARACTERS:
case XMLEvent.CDATA:
if (nestingLevel < 1)
break;
Characters chars = xmlEvent.asCharacters();
if (!chars.isWhiteSpace()) {
line.append(chars.getData());
line.append(',');
}
break;
case XMLEvent.END_ELEMENT:
if (--nestingLevel == 0) {
header.write("0/");
for(int i= line.length()-1; ',' == line.charAt(i); --i) {
line.deleteCharAt(i);
}
content.write(line.toString());
content.write('\n');
}
break;
default:
break;
}
}
// write csv
try (Writer cvsWriter = new OutputStreamWriter(csv, StandardCharsets.UTF_8.name())) {
cvsWriter.write(header.toString());
cvsWriter.write('\n');
cvsWriter.write(content.toString());
}
}
}

private static void serializeElementHeader(StringWriter header, StringBuilder line,
StartElement startElement) {
header.write(startElement.getName().getLocalPart());
header.write('/');
Iterator<Attribute> it = startElement.getAttributes();
while(it.hasNext()) {
Attribute attr = it.next();
header.write('_');
header.write(attr.getName().getLocalPart());
header.write('/');
line.append(attr.getValue());
line.append(',');
}
}

private static String TEST_XML = "<?xml version='1.0'?>"
+ "<Company>"
+ " <Employee id='1'>"
+ " <Email>tp@xyz.com</Email>"
+ " <UserData id='id32' type='AttributesInContext'>"
+ " <UserValue value='7in' title='Heigh'></UserValue>"
+ " <UserValue value='' title='Weight'></UserValue>"
+ " </UserData>"
+ " </Employee>"
+ " <Employee id='2'>"
+ " <Email>tp@xyz.com</Email>"
+ " <UserData id='id33' type='AttributesInContext'>"
+ " <UserValue value='6in' title='Heigh'></UserValue>"
+ " <UserValue value='' title='Weight'></UserValue>"
+ " </UserData>"
+ " </Employee>"
+ " <Employee id='3'>"
+ " <Email>tp@xyz.com</Email>"
+ " <UserData id='id34' type='AttributesInContext'>"
+ " <UserValue value='4in' title='Heigh'></UserValue>"
+ " <UserValue value='' title='Weight'></UserValue>"
+ " </UserData>"
+ " </Employee>"
+ "</Company>";

public static void main(String[] args) throws Exception {

try (InputStream in = new ByteArrayInputStream(TEST_XML.getBytes(Charset.defaultCharset()));
ByteArrayOutputStream out = new ByteArrayOutputStream(4096)) {
convert(in, out);
System.out.print(out.toString());
}

}

How can we convert XML file to CSV?

In pseudo code:

loop through the rows:
loop through all children of `Row`:
write out the text
append a comma
new line

That quick little loop will write a comma at the end of each line, but I'm sure you can figure out how to remove that.

For actually parsing the XML, I suggest using JDOM. It has a pretty intuitive API.



Related Topics



Leave a reply



Submit