遇到的问题
本人需要解析Stack Overflow的dump文件(xml格式),将其中的数据存入数据库,其中关于Stack Overflow帖子(Posts)的xml文件超过了60G。
那么如何解析那么大的xml文件呢(Stack Overflow上有解决方案-链接)?
解决方案
或许你已经想到了分块读取,然后解析。那么如何分块解析呢?Java中处理xml文件有两种处理方案(DOM和Event Driven)。DOM需要将文件全部读取,在内存中构建DOM树,显然这种方案不行;只能选择基于事件的方式。本人选择了StAX,或许用SAX的人比较多,那么SAX和StAX的区别是什么呢?这里有说明:SAX vs StAX。
StAX的学习教程可以在:http://tutorials.jenkov.com/java-xml/index.html 和 https://docs.oracle.com/javase/tutorial/jaxp/ 中寻找。
你需要了解的是:什么是事件驱动。(懂得这个,也就懂得了StAX为什么是分块处理了)
代码实现
Posts.xml预览(使用IntelliJ IDEA可以预览超大xml文件)
<?xml version="1.0" encoding="utf-8"?>
<posts>
<row Id="4" PostTypeId="1" AcceptedAnswerId="7" CreationDate="2008-07-31T21:42:52.667" Score="573" ViewCount="37080" Body="" OwnerUserId="8" LastEditorUserId="6786713" LastEditorDisplayName="Rich B" LastEditDate="2018-07-02T17:55:27.247" LastActivityDate="2018-07-02T17:55:27.247" Title="Convert Decimal to Double?" Tags="<c#><floating-point><type-conversion><double><decimal>" AnswerCount="13" CommentCount="1" FavoriteCount="41" CommunityOwnedDate="2012-10-31T16:42:47.213" />
<row Id="6" PostTypeId="1" AcceptedAnswerId="31" CreationDate="2008-07-31T22:08:08.620" Score="256" ViewCount="16306" Body="" OwnerUserId="9" LastEditorUserId="63550" LastEditorDisplayName="Rich B" LastEditDate="2016-03-19T06:05:48.487" LastActivityDate="2016-03-19T06:10:52.170" Title="" Tags="<html><css><css3><internet-explorer-7>" AnswerCount="5" CommentCount="0" FavoriteCount="10" />
<row Id="7" PostTypeId="2" ParentId="4" CreationDate="2008-07-31T22:17:57.883" Score="401" Body="" OwnerUserId="9" LastEditorUserId="4020527" LastEditDate="2017-12-16T05:06:57.613" LastActivityDate="2017-12-16T05:06:57.613" CommentCount="0" />
<row Id="9" PostTypeId="1" AcceptedAnswerId="1404" CreationDate="2008-07-31T23:40:59.743" Score="1743" ViewCount="480476" Body="" OwnerUserId="1" LastEditorUserId="3956566" LastEditorDisplayName="Rich B" LastEditDate="2018-04-21T17:48:14.477" LastActivityDate="2018-07-25T11:57:14.110" Title="How do I calculate someone's age in C#?" Tags="<c#><.net><datetime>" AnswerCount="64" CommentCount="7" FavoriteCount="399" CommunityOwnedDate="2011-08-16T19:40:43.080" />
<row Id="11" PostTypeId="1" AcceptedAnswerId="1248" CreationDate="2008-07-31T23:55:37.967" Score="1348" ViewCount="136033" Body="" OwnerUserId="1" LastEditorUserId="6479704" LastEditorDisplayName="user2370523" LastEditDate="2017-06-04T15:51:19.780" LastActivityDate="2018-07-05T04:00:56.633" Title="Calculate relative time in C#" Tags="<c#><datetime><time><datediff><relative-time-span>" AnswerCount="35" CommentCount="3" FavoriteCount="529" CommunityOwnedDate="2009-09-04T13:15:59.820" />
<row Id="12" PostTypeId="2" ParentId="11" CreationDate="2008-07-31T23:56:41.303" Score="320" Body="" OwnerUserId="1" LastEditorUserId="1271898" LastEditorDisplayName="GateKiller" LastEditDate="2018-01-12T16:10:22.637" LastActivityDate="2018-01-12T16:10:22.637" CommentCount="11" CommunityOwnedDate="2009-09-04T13:15:59.820" />
<row Id="13" PostTypeId="1" CreationDate="2008-08-01T00:42:38.903" Score="539" ViewCount="157009" Body="" OwnerUserId="9" LastEditorUserId="5321363" LastEditorDisplayName="Rich B" LastEditDate="2018-05-30T15:55:48.913" LastActivityDate="2018-05-30T15:56:46.080" Title="Determine a User's Timezone" Tags="<javascript><html><browser><timezone><timezoneoffset>" AnswerCount="25" CommentCount="6" FavoriteCount="137" />
</posts>
数据库设计
帖子(Post)可以分为问题和回答两种(通过属性PostTypeId的值确定),因为数据量很大(4200万行左右),所以我设计了两张表。
-- Questions: rows whose PostTypeId = 1.
-- Columns kept from the dump: Id CreationDate Score ViewCount OwnerUserId Tags AnswerCount FavoriteCount
CREATE TABLE `questions` (
  `Id` int,
  `CreationDate` datetime,
  `Score` int,
  `ViewCount` int,
  `OwnerUserId` int,
  `Tags` varchar(250),
  `AnswerCount` int,
  `FavoriteCount` int,
  PRIMARY KEY (`Id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-- Answers: rows whose PostTypeId = 2.
-- Columns kept from the dump: Id ParentId CreationDate Score OwnerUserId CommentCount
CREATE TABLE `answers` (
  `Id` int,
  `ParentId` int,
  `CreationDate` datetime,
  `Score` int,
  `OwnerUserId` int,
  `CommentCount` int,
  PRIMARY KEY (`Id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
Java代码
import com.mysql.jdbc.PreparedStatement;

import javax.xml.XMLConstants;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Date;
/**
* Desc: Parse & Import(to mysql db) Stack Overflow dump xml file
* Created by Myth on 10/12/2018
*/
public class XmlProcessor {

    /** Shared JDBC connection; opened by {@link #openMysqlConnection()}. */
    private Connection connection = null;

    /**
     * Opens the MySQL connection used by the import methods.
     *
     * @throws ClassNotFoundException if the MySQL driver is not on the classpath
     * @throws SQLException           if the connection cannot be established
     */
    public void openMysqlConnection() throws ClassNotFoundException, SQLException {
        String driver = "com.mysql.jdbc.Driver";
        String url = "jdbc:mysql://localhost:3306/stackoverflow";
        String username = "root";
        String password = "123456";
        Class.forName(driver);
        this.connection = DriverManager.getConnection(url, username, password);
    }

    /**
     * Closes the connection opened by {@link #openMysqlConnection()}.
     *
     * @throws SQLException if closing fails
     */
    public void closeConnection() throws SQLException {
        this.connection.close();
    }

    /**
     * Renders an XML attribute value as a MySQL literal.
     * A missing attribute (null) becomes SQL {@code NULL} — the original code
     * concatenated the string {@code "null"} into quoted columns instead.
     *
     * @param value  raw attribute value, may be null
     * @param quoted true for string columns: the value is quoted, with
     *               backslashes and double quotes escaped so content such as
     *               Tags cannot break the generated statement
     * @return a literal safe to embed in a multi-row INSERT
     */
    static String sqlLiteral(String value, boolean quoted) {
        if (value == null) {
            return "NULL";
        }
        if (!quoted) {
            return value;
        }
        return "\"" + value.replace("\\", "\\\\").replace("\"", "\\\"") + "\"";
    }

    /**
     * Builds one VALUES tuple for the questions table:
     * (Id, CreationDate, Score, ViewCount, OwnerUserId, Tags, AnswerCount, FavoriteCount).
     */
    static String questionTuple(String id, String creationDate, String score, String viewCount,
                                String ownerUserId, String tags, String answerCount,
                                String favoriteCount) {
        return "(" + sqlLiteral(id, false) + "," + sqlLiteral(creationDate, true) + ","
                + sqlLiteral(score, false) + "," + sqlLiteral(viewCount, false) + ","
                + sqlLiteral(ownerUserId, false) + "," + sqlLiteral(tags, true) + ","
                + sqlLiteral(answerCount, false) + "," + sqlLiteral(favoriteCount, false) + ")";
    }

    /**
     * Builds one VALUES tuple for the answers table:
     * (Id, ParentId, CreationDate, Score, OwnerUserId, CommentCount).
     */
    static String answerTuple(String id, String parentId, String creationDate, String score,
                              String ownerUserId, String commentCount) {
        return "(" + sqlLiteral(id, false) + "," + sqlLiteral(parentId, false) + ","
                + sqlLiteral(creationDate, true) + "," + sqlLiteral(score, false) + ","
                + sqlLiteral(ownerUserId, false) + "," + sqlLiteral(commentCount, false) + ")";
    }

    /**
     * Flushes the accumulated tuples as one multi-row INSERT and commits, then
     * clears the buffer. Does nothing for an empty buffer — this fixes a
     * StringIndexOutOfBoundsException in the original code, which called
     * substring(0, length - 1) even when a commit point fell while one of the
     * two buffers held no rows.
     */
    private void flush(Statement statement, String insertPrefix, StringBuilder tuples)
            throws SQLException {
        if (tuples.length() == 0) {
            return;
        }
        // Drop the trailing ',' appended after the last tuple.
        statement.executeUpdate(insertPrefix + tuples.substring(0, tuples.length() - 1));
        this.connection.commit();
        tuples.setLength(0);
    }

    /**
     * Streams a Stack Overflow Posts.xml dump with StAX (so the 60G+ file is
     * never fully loaded) and bulk-inserts each row into the questions or
     * answers table depending on PostTypeId, committing every
     * {@code commitCount} rows.
     *
     * @param filePath    path of the Posts.xml dump file
     * @param commitCount number of rows per transaction commit
     * @throws SQLException          on any database error
     * @throws FileNotFoundException if {@code filePath} does not exist
     * @throws XMLStreamException    on malformed XML or when a JAXP limit is hit
     */
    public void parsePosts(String filePath, int commitCount)
            throws SQLException, FileNotFoundException, XMLStreamException {
        long begin = System.currentTimeMillis();
        String prefixQuestions = "INSERT INTO questions VALUES ";
        String prefixAnswers = "INSERT INTO answers VALUES ";
        StringBuilder suffixQuestions = new StringBuilder();
        StringBuilder suffixAnswers = new StringBuilder();
        // Commit manually so each batch is one transaction.
        this.connection.setAutoCommit(false);

        XMLInputFactory inputFactory = XMLInputFactory.newInstance();
        inputFactory.setProperty("http://www.oracle.com/xml/jaxp/properties/getEntityCountInfo", "yes");
        // Raise the JDK's accumulated-entity-size limit; with the default, parsing
        // a dump this large aborts with error JAXP00010004.
        inputFactory.setProperty("http://www.oracle.com/xml/jaxp/properties/totalEntitySizeLimit", Integer.MAX_VALUE);

        InputStream input = new FileInputStream(new File(filePath));
        XMLStreamReader reader = inputFactory.createXMLStreamReader(input);
        Statement statement = null;
        int countRow = 0;
        try {
            statement = this.connection.createStatement();
            while (reader.hasNext()) {
                reader.next();
                if (reader.getEventType() != XMLStreamReader.START_ELEMENT
                        || !"row".equals(reader.getLocalName())) {
                    continue;
                }
                // PostTypeId: 1 = question, anything else = answer.
                String postTypeId = reader.getAttributeValue(null, "PostTypeId");
                if ("1".equals(postTypeId)) {
                    suffixQuestions.append(questionTuple(
                            reader.getAttributeValue(null, "Id"),
                            reader.getAttributeValue(null, "CreationDate"),
                            reader.getAttributeValue(null, "Score"),
                            reader.getAttributeValue(null, "ViewCount"),
                            reader.getAttributeValue(null, "OwnerUserId"),
                            reader.getAttributeValue(null, "Tags"),
                            reader.getAttributeValue(null, "AnswerCount"),
                            reader.getAttributeValue(null, "FavoriteCount"))).append(',');
                } else {
                    suffixAnswers.append(answerTuple(
                            reader.getAttributeValue(null, "Id"),
                            reader.getAttributeValue(null, "ParentId"),
                            reader.getAttributeValue(null, "CreationDate"),
                            reader.getAttributeValue(null, "Score"),
                            reader.getAttributeValue(null, "OwnerUserId"),
                            reader.getAttributeValue(null, "CommentCount"))).append(',');
                }
                countRow++;
                if (countRow % commitCount == 0) {
                    flush(statement, prefixQuestions, suffixQuestions);
                    flush(statement, prefixAnswers, suffixAnswers);
                    System.out.println("Committed: " + countRow + " √");
                }
            }
            // Flush whatever remains after the last full batch.
            flush(statement, prefixQuestions, suffixQuestions);
            flush(statement, prefixAnswers, suffixAnswers);
            System.out.println("Committed All: " + countRow + " √");
        } finally {
            // Close everything even on failure; the original leaked all three.
            try {
                reader.close();
            } catch (XMLStreamException ignored) {
                // best effort — the import itself already succeeded or failed above
            }
            try {
                input.close();
            } catch (IOException ignored) {
                // best effort
            }
            if (statement != null) {
                statement.close();
            }
        }
        long end = System.currentTimeMillis();
        System.out.println("Cost: " + (end - begin) / 1000 + " s");
    }
}
大约需要10多分钟就可以将全部数据(4200多万行)导入。
总结
Bug
Exception in thread "main" javax.xml.stream.XMLStreamException: ParseError at [row,col]:[1077158,4084]
Message: JAXP00010004: The accumulated size of entities is "50,000,001" that exceeded the "50,000,000" limit set by "FEATURE_SECURE_PROCESSING".
原因:Java的xml解析器自带limit(官方链接),用来控制读入的大小、控制内存等。因为要处理的xml文件过大,超过了默认的limit,所以在Java代码中(inputFactory.setProperty设置totalEntitySizeLimit处)调大了limit。但即使设置为最大值,也只能勉强处理4000多万条数据;如果超过这个数量,这个方法就行不通了,可以先将超大的xml切分成几个xml文件,然后按上述方式分别处理。
其他问题
使用batch 插入,可以提高效率
使用 XMLStreamReader 而不用 XMLEventReader(区别见官方文档、教程)提高效率