JAVA原生API读取XML大文件的DOM方式和SAX方式比较
分类专栏: J2SE综合 文章标签: XML, Java, Eclipse, JDK, Apache
一直都在使用dom的方式读取xml文件,但如果稍大点的xml文件那么dom方式就有点不太适合。
研究了下jdk的api,用dom和sax方式的解析结果做了个对比
要解析的xml内容格式如下
<?xml version="1.0" encoding="UTF-8"?>
<urlset>
  <url>
    <loc>商品链接访问地址</loc>
    <data>
      <display>
        <title>商品名称</title>
        <price>价格</price>
        <image> 商品图片访问地址 </image>
        <description>商品描述</description>
        <barCode>条形码值</barCode>
        <area>产地 (北京)</area>
        <producedate>生产日期 (2011-11-11)</producedate>
        <manufacturers>生产厂家 (某某某)</manufacturers>
      </display>
    </data>
  </url>
  <!-- ……更多 url 条目 -->
</urlset>
xml文件大小16.5M
首先是dom方式读取,代码如下
-
package test.xml;
-
import java.util.ArrayList;
-
import java.util.HashMap;
-
import java.util.List;
-
import java.util.Map;
-
import java.util.Set;
-
import javax.xml.parsers.DocumentBuilderFactory;
-
import org.w3c.dom.Document;
-
import org.w3c.dom.Element;
-
import org.w3c.dom.Node;
-
import org.w3c.dom.NodeList;
-
/**
 * Reads a product feed with the JDK DOM parser and processes the records in batches.
 *
 * <p>The expected document is a {@code <urlset>} root holding {@code <url>} records. Each record
 * is flattened into a map keyed by the leaf element names listed in {@link #FIELD_NAMES}.
 * Records are collected until {@code statmentSize} of them are available, then handed to
 * {@link #doSomeThing()}; a final, smaller batch is flushed once the document is exhausted.
 *
 * <p>{@link #main(String[])} times one run so the result can be compared with a SAX-based reader.
 */
public class JDKBigXmlDomParse {

    /** Location read by the no-argument {@link #test()}. */
    private static final String DEFAULT_URI = "f:\\test.xml";

    /** Leaf elements copied from every {@code <url>} record into its map. */
    private static final String[] FIELD_NAMES = {
        "loc", "title", "price", "image", "description",
        "barCode", "area", "producedate", "manufacturers"
    };

    /** Number of records collected before a batch is handed to {@link #doSomeThing()}. */
    private int statmentSize = 6;

    /** Records of the batch currently being collected. */
    private List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>(statmentSize);

    /**
     * Parses the default file ({@code f:\test.xml}) and processes all of its records.
     *
     * @throws Exception if the document cannot be read or parsed
     */
    public void test() throws Exception {
        test(DEFAULT_URI);
    }

    /**
     * Parses the document at {@code uri} and processes all of its {@code <url>} records.
     *
     * @param uri location of the XML document, passed unchanged to
     *            {@code DocumentBuilder.parse(String)}
     * @throws Exception if the document cannot be read or parsed
     */
    public void test(String uri) throws Exception {
        Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(uri);
        NodeList urls = doc.getElementsByTagName("url");
        int length = urls.getLength();
        for (int i = 0; i < length; i++) {
            Node node = urls.item(i);
            if (node.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Map<String, Object> entry = parseEntity((Element) node);
            if (!entry.isEmpty()) {
                dataList.add(entry);
                if (dataList.size() == statmentSize) {
                    doSomeThing();
                }
            }
        }
        // The record count is rarely a multiple of the batch size: without this flush the
        // trailing records would be parsed but never processed.
        if (!dataList.isEmpty()) {
            doSomeThing();
        }
    }

    /**
     * Flattens one {@code <url>} element into a map of field name to trimmed text.
     *
     * @param element the {@code <url>} element
     * @return a map with one entry per name in {@link #FIELD_NAMES}; missing or empty elements
     *         map to the empty string
     */
    private Map<String, Object> parseEntity(Element element) {
        Map<String, Object> map = new HashMap<String, Object>();
        for (int i = 0; i < FIELD_NAMES.length; i++) {
            map.put(FIELD_NAMES[i], getElementValueByTagName(element, FIELD_NAMES[i]));
        }
        return map;
    }

    /**
     * Returns the trimmed text of the first descendant of {@code element} named {@code tagName}.
     *
     * @param element element to search below
     * @param tagName name of the wanted descendant element
     * @return the trimmed text content, or the empty string when no such element exists or it
     *         has no text
     */
    private String getElementValueByTagName(Element element, String tagName) {
        NodeList nodeList = element.getElementsByTagName(tagName);
        if (nodeList.getLength() == 0) {
            return "";
        }
        // getTextContent() joins every text/CDATA child and copes with an element that has no
        // children at all; reading only getFirstChild() threw NullPointerException for
        // <tag></tag> and lost text stored in later sibling nodes.
        String text = nodeList.item(0).getTextContent();
        return text == null ? "" : text.trim();
    }

    /**
     * Processes the current batch and empties it so the next batch can be collected.
     */
    private void doSomeThing() {
        // Printing is left disabled here; presumably so console output does not distort the
        // timing reported by main().
        //printMapList(dataList);
        dataList.clear();
    }

    /**
     * Prints every map on its own line as {@code {"key":"value",...}}.
     *
     * @param dataList records to print
     */
    private void printMapList(List<Map<String, Object>> dataList) {
        for (Map<String, Object> map : dataList) {
            System.out.println();
            System.out.print("{");
            boolean first = true;
            Set<Map.Entry<String, Object>> entries = map.entrySet();
            for (Map.Entry<String, Object> entry : entries) {
                if (!first) {
                    System.out.print(",");
                }
                System.out.print("\"" + entry.getKey() + "\":");
                System.out.print("\"" + entry.getValue() + "\"");
                first = false;
            }
            System.out.print("}");
        }
        System.out.println();
    }

    /**
     * Runs {@link #test()} once and prints the elapsed wall-clock time in seconds.
     *
     * @param args unused
     * @throws Exception if parsing fails
     */
    public static void main(String[] args) throws Exception {
        long start = System.nanoTime();
        new JDKBigXmlDomParse().test();
        long end = System.nanoTime();
        System.out.println("耗时:" + (end - start) / 1000000000.0 + "秒");
    }
}
运行结果:
耗时:3.212168172秒
sax方式读取,代码如下:
-
package test.xml;
-
import java.io.FileInputStream;
-
import java.io.InputStream;
-
import java.util.ArrayList;
-
import java.util.HashMap;
-
import java.util.List;
-
import java.util.Map;
-
import java.util.Set;
-
import javax.xml.parsers.SAXParser;
-
import javax.xml.parsers.SAXParserFactory;
-
import org.apache.commons.lang.StringUtils;
-
import org.xml.sax.Attributes;
-
import org.xml.sax.SAXException;
-
import org.xml.sax.helpers.DefaultHandler;
-
/**
 * Reads a product feed with the JDK SAX parser and processes the records in batches.
 *
 * <p>The expected document is a {@code <urlset>} root holding {@code <url>} records. The text of
 * every leaf element inside a record is stored in that record's map under the element name.
 * Finished records are collected until {@code statmentSize} of them are available, then
 * {@link #doSomeThing()} is called and the batch is cleared; the closing {@code </urlset>}
 * flushes whatever is left.
 *
 * <p>{@link #main(String[])} times one run so the result can be compared with a DOM-based reader.
 */
public class JDKBigXmlSaxParse extends DefaultHandler {

    /** File read by the no-argument {@link #test()}. */
    private static final String DEFAULT_PATH = "f:\\test.xml";

    /** Number of records collected before a batch is handed to {@link #doSomeThing()}. */
    private int statmentSize = 6;

    /** Records of the batch currently being collected. */
    private List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>(statmentSize);

    /** Record being filled, or {@code null} while the parser is outside any {@code <url>}. */
    private Map<String, Object> dataMap;

    /** Name of the most recently opened element, or {@code ""} once an element has closed. */
    private String currentTag = "";

    /** Text gathered since the last start tag; the parser may deliver it in several chunks. */
    private final StringBuilder textBuffer = new StringBuilder();

    /**
     * Parses the default file ({@code f:\test.xml}).
     *
     * @throws Exception if the file cannot be read or parsed
     */
    public void test() throws Exception {
        test(DEFAULT_PATH);
    }

    /**
     * Parses the XML file at {@code path}, using this object as the SAX handler.
     *
     * @param path file system path of the XML file
     * @throws Exception if the file cannot be read or parsed
     */
    public void test(String path) throws Exception {
        SAXParser sax = SAXParserFactory.newInstance().newSAXParser();
        InputStream in = new FileInputStream(path);
        try {
            sax.parse(in, this);
        } finally {
            // Close the stream even when parsing fails.
            in.close();
        }
    }

    /**
     * Buffers character data that belongs to the record currently being read.
     *
     * <p>A single text node may arrive in several calls, so the chunks are only appended here
     * and the complete value is stored when the element ends.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        // Text outside a <url> record has no map to go to; ignoring it avoids both a
        // NullPointerException before the first record and corrupting the previous record.
        if (dataMap != null) {
            textBuffer.append(ch, start, length);
        }
    }

    /**
     * Stores the buffered text of a finished leaf element, completes a record on
     * {@code </url>}, and triggers batch processing when a batch is full or the document ends.
     */
    @Override
    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        // Only the element opened last (a leaf) owns the buffered text.
        if (dataMap != null && qName.equals(currentTag)) {
            String value = textBuffer.toString();
            if (!isBlank(value)) {
                dataMap.put(currentTag, value.trim());
            }
        }
        textBuffer.setLength(0);
        currentTag = "";

        if ("url".equals(qName) && dataMap != null) {
            dataList.add(dataMap);
            // Detach the finished record so later text cannot modify it.
            dataMap = null;
        }
        if (dataList.size() == statmentSize) {
            doSomeThing();
            dataList.clear();
        }
        if ("urlset".equals(qName) && dataList.size() != 0) {
            doSomeThing();
            dataList.clear();
        }
    }

    /**
     * Starts a new record on {@code <url>}; for any other element remembers its name as the
     * target of the text that follows.
     */
    @Override
    public void startElement(String uri, String localName, String qName,
            Attributes attributes) throws SAXException {
        // Whatever was buffered so far belonged to the parent (usually indentation).
        textBuffer.setLength(0);
        if ("url".equals(qName)) {
            dataMap = new HashMap<String, Object>();
            return;
        }
        currentTag = qName;
    }

    /**
     * Runs {@link #test()} once and prints the elapsed wall-clock time in seconds.
     *
     * @param args unused
     * @throws Exception if parsing fails
     */
    public static void main(String[] args) throws Exception {
        long start = System.nanoTime();
        new JDKBigXmlSaxParse().test();
        long end = System.nanoTime();
        System.out.println("耗时:" + (end - start) / 1000000000.0 + "秒");
    }

    /**
     * Hook called with a complete batch available in the internal list; the caller clears the
     * list afterwards.
     */
    public void doSomeThing() {
        // Printing is left disabled here; presumably so console output does not distort the
        // timing reported by main().
        //printMapList(dataList);
    }

    /**
     * Prints every map on its own line as {@code {"key":"value",...}}.
     *
     * @param dataList records to print
     */
    private void printMapList(List<Map<String, Object>> dataList) {
        for (Map<String, Object> map : dataList) {
            System.out.println();
            System.out.print("{");
            boolean first = true;
            Set<Map.Entry<String, Object>> entries = map.entrySet();
            for (Map.Entry<String, Object> entry : entries) {
                if (!first) {
                    System.out.print(",");
                }
                System.out.print("\"" + entry.getKey() + "\":");
                System.out.print("\"" + entry.getValue() + "\"");
                first = false;
            }
            System.out.print("}");
        }
        System.out.println();
    }

    /**
     * Tells whether {@code text} is empty or consists only of whitespace as defined by
     * {@link Character#isWhitespace(char)}.
     */
    private static boolean isBlank(CharSequence text) {
        for (int i = 0; i < text.length(); i++) {
            if (!Character.isWhitespace(text.charAt(i))) {
                return false;
            }
        }
        return true;
    }
}
运行结果:
耗时:0.639864769秒
可以看到dom消耗的时间是sax方式的5倍。结论:如果只是读取xml文件,还是sax方式强。。。
而且在eclipse里面用dom方式运行的时候,可能会出现 java.lang.OutOfMemoryError: Java heap space 这个问题,因为dom方式需要把整个文档一次性加载到内存中。