java 解析超大xml_如何在Java中解析大(50 GB)XML文件

目前,我正在尝试使用SAX解析器,但解析到文件大约3/4处时程序就完全卡死了。我尝试过分配更多内存等办法,但没有任何改善。

有什么办法可以加快速度吗?更好的方法?

我已经把代码精简到只剩最基本的部分,目前的代码如下;但在命令行中运行时,它的速度仍然达不到我的期望。

使用“java -Xms4096m -Xmx8192m -jar reader.jar”运行时,在处理到大约第700000篇文章时会出现“GC overhead limit exceeded”(超出GC开销限制)错误。

主类(Main):

public class Read {

public static void main(String[] args) {

pages = XMLManager.getPages();

}

}

XML管理器

/**
 * Reads the pages of an XML dump file by driving a SAX parse with a {@link PageHandler}.
 */
public class XMLManager {

    /**
     * Parses the file {@code ../enwiki-20140811-pages-articles.xml} (relative to the working
     * directory, written with a Windows path separator in the code) and returns whatever the
     * {@link PageHandler} collected.
     *
     * <p>Failures are reported with a stack trace on standard error rather than propagated.
     *
     * @return the pages gathered by the handler, or an empty list (never {@code null}) when the
     *         parser cannot be configured, the document is malformed, or the file cannot be read
     */
    public static ArrayList getPages() {
        // Start with an empty list so a failed parse can never hand a null to the caller.
        ArrayList pages = new ArrayList<>();

        SAXParserFactory factory = SAXParserFactory.newInstance();

        try {
            SAXParser parser = factory.newSAXParser();
            File file = new File("..\\enwiki-20140811-pages-articles.xml");
            PageHandler pageHandler = new PageHandler();

            parser.parse(file, pageHandler);

            pages = pageHandler.getPages();
        } catch (ParserConfigurationException | SAXException | IOException e) {
            // All three failure modes are handled identically: report and fall through.
            e.printStackTrace();
        }

        return pages;
    }
}

页面处理程序

/**
 * SAX handler that turns each non-redirect {@code <page>} element into a {@link Page} and
 * collects the results in a list.
 *
 * <p>For every page it records the {@code <title>}, the first {@code <id>} encountered inside
 * the page, a cleaned-up version of the {@code <text>} body, and a summary made of the first
 * words of that cleaned text. A page that contains a {@code <redirect>} element is discarded.
 *
 * <p>NOTE(review): every completed page, including its full cleaned text, stays referenced from
 * the internal list until parsing ends. For a very large dump this is presumably what exhausts
 * the heap; consider handing each page to a consumer as soon as it is completed. Confirm with a
 * heap profile.
 */
public class PageHandler extends DefaultHandler {

    // Patterns are compiled once; compiling them again for every article is wasted work.

    /**
     * Matches a {@code <ref ...>...</ref>} span. The pattern previously in this position was
     * empty, which matches at every index and therefore inserted a space between all characters.
     */
    private static final Pattern REFERENCES = Pattern.compile("(?s)<ref(.+?)</ref>");

    /** Matches a {@code {{...}}} span (shortest match, may cross lines). */
    private static final Pattern DOUBLE_BRACES = Pattern.compile("(?s)\\{\\{(.+?)\\}\\}");

    /** Matches the "See also" heading and everything after it. */
    private static final Pattern SEE_ALSO_ONWARDS = Pattern.compile("(?s)==See also==.+");

    /** Matches a pipe character, which separates parts of a link. */
    private static final Pattern PIPE = Pattern.compile("\\|");

    /** Matches a line feed. */
    private static final Pattern NEW_LINE = Pattern.compile("\\n");

    /** Matches any character that is not an ASCII letter, digit, dash or whitespace. */
    private static final Pattern DISALLOWED_CHARACTER = Pattern.compile("[^a-zA-Z0-9- \\s]");

    /** Matches a run of one or more spaces. */
    private static final Pattern SPACE_RUN = Pattern.compile(" +");

    /** Matches up to 75 whitespace-separated words from the first non-whitespace character. */
    private static final Pattern FIRST_WORDS = Pattern.compile("([\\S]+\\s*){1,75}");

    /** Summary stored when the cleaned text contains no words at all. */
    private static final String NO_SUMMARY = "None";

    /** Completed, non-redirect pages in document order. */
    private final ArrayList<Page> pages = new ArrayList<>();

    /** Page currently being built, or {@code null} outside a page or after it was discarded. */
    private Page page;

    /** Character data seen since the most recent start tag. */
    private final StringBuilder stringBuilder = new StringBuilder();

    /** Whether the current page already has its id, so later nested ids are ignored. */
    private boolean idSet = false;

    /** Creates a handler with an empty page list. */
    public PageHandler() {
        super();
    }

    /**
     * Clears the text buffer, starts a new page on {@code <page>}, and flags the current page as
     * redirecting on {@code <redirect>}.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes)
            throws SAXException {
        // Reuse one buffer instead of allocating a new builder for every element.
        stringBuilder.setLength(0);

        if (qName.equals("page")) {
            page = new Page();
            idSet = false;
        } else if (qName.equals("redirect") && page != null) {
            page.setRedirecting(true);
        }
    }

    /**
     * Copies the buffered text into the current page for {@code title}, {@code id} and
     * {@code text}, and stores the page when {@code </page>} is reached.
     *
     * @throws NumberFormatException if the first id of a page is not a valid {@code int}
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (page == null || page.isRedirecting()) {
            // Outside a page, or the page is a redirect: drop it so it is never stored.
            page = null;
            return;
        }

        if (qName.equals("title")) {
            page.setTitle(stringBuilder.toString());
        } else if (qName.equals("id")) {
            if (!idSet) {
                // Only the first id in a page is taken; trim so stray whitespace cannot break parsing.
                page.setId(Integer.parseInt(stringBuilder.toString().trim()));
                idSet = true;
            }
        } else if (qName.equals("text")) {
            String articleText = cleanArticleText(stringBuilder.toString());
            page.setSummaryText(summarize(articleText));
            page.setText(articleText);
        } else if (qName.equals("page")) {
            pages.add(page);
            page = null;
        }
    }

    /** Appends the reported characters to the text buffer. */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        stringBuilder.append(ch, start, length);
    }

    /**
     * Returns the live list of pages collected so far.
     *
     * @return the internal list of completed pages (not a copy)
     */
    public ArrayList getPages() {
        return pages;
    }

    /** Strips markup from raw article text and collapses it to single-spaced plain words. */
    private static String cleanArticleText(String rawText) {
        String text = REFERENCES.matcher(rawText).replaceAll(" ");      // remove references
        text = DOUBLE_BRACES.matcher(text).replaceAll(" ");             // remove {{...}} spans
        text = SEE_ALSO_ONWARDS.matcher(text).replaceAll(" ");          // drop "See also" onwards
        text = PIPE.matcher(text).replaceAll(" ");                      // separate multiple links
        text = NEW_LINE.matcher(text).replaceAll(" ");                  // remove new lines
        text = DISALLOWED_CHARACTER.matcher(text).replaceAll(" ");      // keep alphanumerics, dashes, whitespace
        return SPACE_RUN.matcher(text.trim()).replaceAll(" ");          // collapse runs of spaces
    }

    /** Returns the first (up to 75) words of the text, or {@code "None"} if it has no words. */
    private static String summarize(String articleText) {
        Matcher matcher = FIRST_WORDS.matcher(articleText);
        // Check the result of find() instead of relying on an IllegalStateException from group().
        return matcher.find() ? matcher.group() : NO_SUMMARY;
    }
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值