html 替换body代码,[分享]从HTML中提取TITLE和BODY标签内容的代码

使用了HTML Parser的一个小程序。

import java.io.BufferedWriter;

import java.io.File;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.OutputStreamWriter;

import java.io.Writer;

import java.util.ArrayList;

import java.util.List;

import org.apache.commons.collections.ListUtils;

import org.apache.commons.lang.StringUtils;

import org.htmlparser.Node;

import org.htmlparser.Parser;

import org.htmlparser.Remark;

import org.htmlparser.tags.ScriptTag;

import org.htmlparser.util.NodeList;

import org.htmlparser.util.ParserException;

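/**
 * Converter that reads an HTML file with the HTML Parser library, extracts the
 * text of its title and body, and writes that text to an output file.
 *
 * @version 1.0
 * @author 郝春利
 */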

public class HtmlContentsFileConverter extends AbstractConverter {

private String inputEncode;

private String outputEncode;

/**
 * Creates a converter with the character encodings used for reading and writing.
 *
 * @param inputEncode  encoding name handed to the HTML parser when reading the input file
 * @param outputEncode encoding name used when writing the output file
 */
public HtmlContentsFileConverter(String inputEncode, String outputEncode) {
    // Stored as given; no validation happens here.
    this.outputEncode = outputEncode;
    this.inputEncode = inputEncode;
}

/**
 * Extracts the title and body text from the HTML file {@code r} and writes
 * them to {@code w}, overwriting any existing content.
 *
 * <p>The output consists of the trimmed/escaped title followed by a line break
 * (only when a title node was found), followed by the trimmed/escaped plain
 * text of the first body node's children, with comment and script nodes
 * removed. A document without a body node, or with an empty body, produces an
 * empty body section instead of failing.
 *
 * @param r the HTML file to read, parsed using the configured input encoding
 * @param w the file to write, encoded using the configured output encoding
 * @return always {@code true} when the method returns normally
 * @throws IOException if the output file cannot be opened or written
 * @throws RuntimeException wrapping a {@code ParserException} if parsing fails
 */
public boolean convert(File r, File w) throws IOException {
    String title = StringUtils.EMPTY;
    StringBuilder body = new StringBuilder(1024);

    try {
        Parser parser = new Parser(r.getPath());
        parser.setEncoding(inputEncode);
        NodeList rootNode = parser.parse(null);

        Node titleNode = getTitleNode(rootNode);
        if (titleNode != null) {
            // The title goes on its own line; the original appended the literal "/n".
            title = BatchUtil.trimAndEscape(titleNode.toPlainTextString()) + "\n";
        }

        // Only the first matching node is used. List<?> keeps this independent of
        // whether the helper declares a raw or a parameterized return type.
        List<?> bodyNodes = getTargetNodeListByName(rootNode, "body");
        Node bodyNode = bodyNodes.isEmpty() ? null : (Node) bodyNodes.get(0);

        // Both a missing body node and a body without children are legal inputs.
        NodeList bodyChildren = (bodyNode == null) ? null : removeRemark(bodyNode.getChildren());
        if (bodyChildren != null) {
            for (Node node : bodyChildren.toNodeArray()) {
                body.append(node.toPlainTextString());
            }
        }
    } catch (ParserException e) {
        throw new RuntimeException(e);
    }

    // Open the stream on its own so it can still be closed if the writer
    // cannot be created (e.g. unsupported output encoding).
    FileOutputStream out = new FileOutputStream(w, false);
    Writer to = null;
    try {
        to = new BufferedWriter(new OutputStreamWriter(out, outputEncode));
        to.write(title + BatchUtil.trimAndEscape(body.toString()));
        // Flush explicitly so that a write failure surfaces here rather than
        // inside the best-effort close below.
        to.flush();
    } finally {
        try {
            if (to != null) {
                to.close();
            } else {
                out.close();
            }
        } catch (IOException ignored) {
            // Best-effort close: the data was already flushed, or an earlier
            // exception is propagating and must not be masked.
        }
    }

    return true;
}

/**
 * Searches {@code list} depth-first, in document order, for the first node
 * whose {@code toString()} text begins with "title" (case-insensitive).
 *
 * <p>Unlike a search that stops at the first node having children, this keeps
 * examining later siblings when a subtree does not contain a match.
 *
 * @param list the nodes to search; may be {@code null}
 * @return the first matching node, or {@code null} if there is none or
 *         {@code list} is {@code null}
 */
public Node getTitleNode(NodeList list) {
    if (list == null) {
        return null;
    }

    final String prefix = "title";
    for (Node node : list.toNodeArray()) {
        String description = node.toString();
        // regionMatches(ignoreCase) does not depend on the default locale,
        // unlike toLowerCase() without an explicit Locale.
        if (description != null && description.regionMatches(true, 0, prefix, 0, prefix.length())) {
            return node;
        }

        // A null or empty child list simply yields null here.
        Node nested = getTitleNode(node.getChildren());
        if (nested != null) {
            return nested;
        }
        // No match in this subtree: continue with the next sibling.
    }

    return null;
}

/**
 * Collects, depth-first and in document order, every node under {@code list}
 * whose {@code getText()} value begins with {@code name} (case-insensitive).
 * A matching node is added before any matches found among its descendants.
 *
 * <p>NOTE(review): this is a prefix match on the node text, so it is not
 * restricted to tag nodes or to an exact tag name — confirm whether callers
 * rely on that.
 *
 * @param list the nodes to search; may be {@code null}
 * @param name the prefix to look for; a blank value matches nothing
 * @return a list of the matching nodes, empty when {@code list} is
 *         {@code null}, {@code name} is blank, or nothing matches
 */
public List<Node> getTargetNodeListByName(NodeList list, String name) {
    List<Node> resultList = new ArrayList<Node>();
    if (list == null || StringUtils.isBlank(name)) {
        return resultList;
    }

    for (Node node : list.toNodeArray()) {
        String text = node.getText();
        // Locale-independent, case-insensitive prefix comparison.
        if (text != null && text.regionMatches(true, 0, name, 0, name.length())) {
            resultList.add(node);
        }

        // The recursive call returns an empty list for null or empty children,
        // so no separate emptiness check is needed.
        resultList.addAll(getTargetNodeListByName(node.getChildren(), name));
    }

    return resultList;
}

/**
 * Removes every {@code Remark} and {@code ScriptTag} node from {@code list}
 * and, recursively, from the child lists of all nodes that were in it.
 * The list is modified in place.
 *
 * @param list the node list to clean; may be {@code null}
 * @return the same {@code list} instance, or {@code null} if it was {@code null}
 */
public NodeList removeRemark(NodeList list) {
    if (list == null) {
        return list;
    }

    // Walk an array snapshot so that removing from the list cannot disturb the iteration.
    Node[] snapshot = list.toNodeArray();
    for (Node current : snapshot) {
        boolean discard = (current instanceof Remark) || (current instanceof ScriptTag);
        if (discard) {
            list.remove(current);
        }

        // Children are processed whether or not the node itself was removed.
        NodeList children = current.getChildren();
        if (children == null) {
            continue;
        }
        Node[] childArray = children.toNodeArray();
        if (childArray != null && childArray.length > 0) {
            removeRemark(children);
        }
    }

    return list;
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值