java xml 大文件怎么打开_如何用Java解析大型(50 GB)XML文件

目前我正在尝试使用 SAX 解析器,但在处理到文件大约 3/4 的位置时,程序会完全卡死。我已经尝试过分配更多内存等办法,但没有任何改善。

有什么办法可以加快速度吗?或者有没有更好的方法?

我已经把代码精简到只剩最基本的部分,现在的代码如下;但在命令行中运行时,速度仍然达不到我的预期。

使用 “java -Xms4096m -Xmx8192m -jar reader.jar” 运行时,在处理到约第 700000 个页面时,我遇到了 “GC overhead limit exceeded”(GC 开销超出限制)错误。

主类(Read):

public class Read {

public static void main(String[] args) {

pages = XMLManager.getPages();

}

}

XMLManager

/**
 * Runs a SAX parse of the Wikipedia dump with a {@code PageHandler} and
 * returns the pages that handler collected.
 */
public class XMLManager {

    /** Location of the dump used when no file is supplied explicitly. */
    private static final String DEFAULT_DUMP_PATH = "..\\enwiki-20140811-pages-articles.xml";

    /**
     * Parses the default dump file.
     *
     * @return the pages collected by the handler; an empty list (never
     *         {@code null}) if the parser could not be created or parsing failed
     */
    public static ArrayList getPages() {
        return getPages(new File(DEFAULT_DUMP_PATH));
    }

    /**
     * Parses the given XML file.
     *
     * @param file the XML dump to read
     * @return the pages collected by the handler; an empty list (never
     *         {@code null}) if the parser could not be created or parsing failed
     */
    public static ArrayList getPages(File file) {
        try {
            SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
            PageHandler pageHandler = new PageHandler();
            parser.parse(file, pageHandler);
            return pageHandler.getPages();
        } catch (ParserConfigurationException | SAXException | IOException e) {
            // Same diagnostics as before, but callers get an empty list rather
            // than a null they would have to remember to check.
            e.printStackTrace();
            return new ArrayList<>();
        }
    }
}

PageHandler(页面处理器)

/**
 * SAX handler that builds a {@code Page} for every non-redirect {@code <page>}
 * element and keeps all of them in memory.
 *
 * <p>For each page it records the title, the first {@code <id>} encountered,
 * a cleaned-up version of the {@code <text>} body and a summary made of the
 * first (up to) 75 words of that cleaned text.
 *
 * <p>NOTE(review): every page, including its full cleaned text, is retained in
 * {@link #getPages()} until parsing ends. For a very large dump this list is
 * presumably what exhausts the heap; consider consuming each page as soon as
 * it is complete instead of accumulating them.
 */
public class PageHandler extends DefaultHandler {

    // Compiled once: these are applied to every article in the dump.

    /**
     * Inline references. The source had the bare pattern "(?s)" here, which
     * matches the empty string at every position and therefore inserted a space
     * between all characters; the tag part had evidently been stripped as HTML.
     */
    private static final Pattern REFERENCES = Pattern.compile("(?s)<ref(.+?)</ref>");

    /** Double-brace template blocks. */
    private static final Pattern TEMPLATES = Pattern.compile("(?s)\\{\\{(.+?)\\}\\}");

    /** The "See also" heading and everything after it. */
    private static final Pattern SEE_ALSO = Pattern.compile("(?s)==See also==.+");

    /** Anything that is not a letter, digit, dash or whitespace. */
    private static final Pattern NON_ALPHANUMERIC = Pattern.compile("[^a-zA-Z0-9- \\s]");

    /** Runs of spaces. */
    private static final Pattern MULTIPLE_SPACES = Pattern.compile(" +");

    /** First 1 to 75 whitespace-separated words. */
    private static final Pattern FIRST_WORDS = Pattern.compile("([\\S]+\\s*){1,75}");

    private final ArrayList<Page> pages = new ArrayList<>();

    /** Page currently being built, or {@code null} when outside a page or after a redirect was dropped. */
    private Page page;

    /** Character data of the element currently being read. */
    private final StringBuilder stringBuilder = new StringBuilder();

    /** Whether the current page already took its id; only the first {@code <id>} inside a page is used. */
    private boolean idSet = false;

    public PageHandler() {
        super();
    }

    /**
     * Resets the text buffer for the new element, starts a new page on
     * {@code <page>} and marks the current page as redirecting on {@code <redirect>}.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes)
            throws SAXException {
        stringBuilder.setLength(0);

        if (qName.equals("page")) {
            page = new Page();
            idSet = false;
        } else if (qName.equals("redirect")) {
            if (page != null) {
                page.setRedirecting(true);
            }
        }
    }

    /**
     * Stores the buffered text into the current page according to the element
     * that just closed, and adds the page to the result list on {@code </page>}.
     * If there is no current page, or it is a redirect, the page is discarded.
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (page == null || page.isRedirecting()) {
            // Redirects are dropped at the first closing tag seen after the marker.
            page = null;
            return;
        }

        if (qName.equals("title")) {
            page.setTitle(stringBuilder.toString());
        } else if (qName.equals("id")) {
            if (!idSet) {
                page.setId(Integer.parseInt(stringBuilder.toString().trim()));
                idSet = true;
            }
        } else if (qName.equals("text")) {
            String articleText = cleanArticleText(stringBuilder.toString());
            page.setSummaryText(summarize(articleText));
            page.setText(articleText);
        } else if (qName.equals("page")) {
            pages.add(page);
            page = null;
        }
    }

    /** Appends a chunk of character data to the buffer of the current element. */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        stringBuilder.append(ch, start, length);
    }

    /**
     * Returns the pages collected so far.
     *
     * @return the live internal list of pages (not a copy)
     */
    public ArrayList<Page> getPages() {
        return pages;
    }

    /** Strips wiki markup from raw article text and normalizes it to single-space-separated words. */
    private static String cleanArticleText(String rawText) {
        String text = REFERENCES.matcher(rawText).replaceAll(" ");  // remove references
        text = TEMPLATES.matcher(text).replaceAll(" ");             // remove links underneath headings
        text = SEE_ALSO.matcher(text).replaceAll(" ");              // remove everything after "See also"
        text = text.replace('|', ' ');                              // separate multiple links
        text = text.replace('\n', ' ');                             // remove new lines
        text = NON_ALPHANUMERIC.matcher(text).replaceAll(" ");      // keep alphanumerics, dashes, whitespace
        return MULTIPLE_SPACES.matcher(text.trim()).replaceAll(" "); // collapse runs of spaces
    }

    /** Returns the first (up to) 75 words of the text, or {@code "None"} when it contains no words. */
    private static String summarize(String articleText) {
        Matcher matcher = FIRST_WORDS.matcher(articleText);
        return matcher.find() ? matcher.group() : "None";
    }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值