// A small program that uses HTML Parser.
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.collections.ListUtils;
import org.apache.commons.lang.StringUtils;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.Remark;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
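/**
 * Converts an HTML file into plain text: the document title followed by the
 * text content of the body element, with comments and scripts removed.
 *
 * @version 1.0
 * @author 郝春利
 */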
public class HtmlContentsFileConverter extends AbstractConverter {
private String inputEncode;
private String outputEncode;
/**
 * Creates a converter configured with the encodings used for reading and writing.
 *
 * @param inputEncode  encoding name handed to the HTML parser when reading the source file
 * @param outputEncode encoding name used when writing the converted text file
 */
public HtmlContentsFileConverter(String inputEncode, String outputEncode) {
    super();
    this.outputEncode = outputEncode;
    this.inputEncode = inputEncode;
}
/**
 * Extracts the title and body text from the HTML file {@code r} and writes them
 * as plain text to {@code w}, replacing any existing content of {@code w}.
 *
 * <p>The output is the title (if a title node is found) followed by a newline,
 * then the concatenated plain text of the first body node's children after
 * comment and script nodes have been removed. Both parts are passed through
 * {@code BatchUtil.trimAndEscape} before being written. When the document has
 * no body node, or the body has no children, the body part is empty.
 *
 * @param r the HTML file to read; parsed using the configured input encoding
 * @param w the text file to write; encoded using the configured output encoding
 * @return always {@code true} when the method completes normally
 * @throws IOException if the output file cannot be opened, written, or flushed
 * @throws RuntimeException wrapping a {@link ParserException} if the HTML cannot be parsed
 */
public boolean convert(File r, File w) throws IOException {
    String title = StringUtils.EMPTY;
    StringBuilder body = new StringBuilder(1024);
    try {
        Parser parser = new Parser(r.getPath());
        parser.setEncoding(inputEncode);
        NodeList rootNode = parser.parse(null);

        Node titleNode = getTitleNode(rootNode);
        if (titleNode != null) {
            // The original appended the two characters "/n"; a line break was intended.
            title = BatchUtil.trimAndEscape(titleNode.toPlainTextString()) + "\n";
        }

        // Only the first matching body node is used.
        List<?> bodyNodes = getTargetNodeListByName(rootNode, "body");
        if (!bodyNodes.isEmpty()) {
            Node bodyNode = (Node) bodyNodes.get(0);
            // removeRemark returns null when the body has no child list.
            NodeList content = removeRemark(bodyNode.getChildren());
            if (content != null) {
                for (Node node : content.toNodeArray()) {
                    body.append(node.toPlainTextString());
                }
            }
        }
    } catch (ParserException e) {
        throw new RuntimeException("Failed to parse HTML file: " + r, e);
    }

    // Open the raw stream first so it is released even if the encoding name is rejected.
    FileOutputStream out = new FileOutputStream(w, false);
    try {
        Writer to = new BufferedWriter(new OutputStreamWriter(out, outputEncode));
        to.write(title + BatchUtil.trimAndEscape(body.toString()));
        // Closing the writer flushes the buffer and the charset encoder. Doing it here,
        // inside the try, lets a failed flush propagate instead of being swallowed.
        to.close();
    } finally {
        try {
            // No-op after a successful close above; frees the file handle on failure paths.
            out.close();
        } catch (IOException ignored) {
            // Best effort only: any primary failure is already propagating.
        }
    }
    return true;
}
/**
 * Searches {@code list} depth-first for the first node whose {@code toString()}
 * starts with "title" (ASCII case-insensitive).
 *
 * <p>NOTE(review): this presumably relies on the parser's title tag rendering
 * itself as "TITLE: ..." in {@code toString()} - confirm against the HTML Parser
 * version in use.
 *
 * @param list the nodes to search; may be {@code null}
 * @return the first matching node in document order, or {@code null} if
 *         {@code list} is {@code null} or no node matches
 */
public Node getTitleNode(NodeList list) {
    if (list == null) {
        return null;
    }
    final String prefix = "title";
    for (Node node : list.toNodeArray()) {
        // regionMatches(ignoreCase) is locale-independent, unlike toLowerCase(),
        // which maps 'I' to a dotless i under a Turkish default locale.
        if (node.toString().regionMatches(true, 0, prefix, 0, prefix.length())) {
            return node;
        }
        NodeList children = node.getChildren();
        if (children != null && children.size() > 0) {
            Node found = getTitleNode(children);
            // Only stop on a hit; otherwise keep scanning the remaining siblings.
            if (found != null) {
                return found;
            }
        }
    }
    return null;
}
/**
 * Collects, in depth-first document order, every node in {@code list} and its
 * descendants whose {@code getText()} starts with {@code name}
 * (case-insensitive).
 *
 * <p>Matching is a plain prefix test on the node text, so a short name such as
 * "b" also matches "body", and any node kind whose text happens to start with
 * {@code name} is included.
 *
 * @param list the nodes to search; may be {@code null}
 * @param name the prefix to look for; blank or {@code null} matches nothing
 * @return the matching nodes; an immutable empty list when {@code list} is
 *         {@code null} or {@code name} is blank, otherwise a new mutable list
 */
public List<Node> getTargetNodeListByName(NodeList list, String name) {
    if (list == null || StringUtils.isBlank(name)) {
        // ListUtils.EMPTY_LIST is a raw, immutable empty list, so the conversion is safe.
        @SuppressWarnings("unchecked")
        List<Node> none = ListUtils.EMPTY_LIST;
        return none;
    }
    List<Node> resultList = new ArrayList<Node>();
    final int prefixLength = name.length();
    for (Node node : list.toNodeArray()) {
        String text = node.getText();
        // regionMatches(ignoreCase) avoids the default-locale dependence of toLowerCase().
        if (text != null && text.regionMatches(true, 0, name, 0, prefixLength)) {
            resultList.add(node);
        }
        NodeList children = node.getChildren();
        if (children != null && children.size() > 0) {
            resultList.addAll(getTargetNodeListByName(children, name));
        }
    }
    return resultList;
}
/**
 * Strips comment ({@link Remark}) and script ({@link ScriptTag}) nodes from
 * {@code list} and, recursively, from the child lists of its nodes.
 *
 * <p>The list is modified in place; iteration runs over an array snapshot so
 * removals do not disturb the traversal.
 *
 * @param list the node list to clean; may be {@code null}
 * @return the same {@code list} instance that was passed in ({@code null} if it was {@code null})
 */
public NodeList removeRemark(NodeList list) {
    if (list != null) {
        Node[] snapshot = list.toNodeArray();
        for (int i = 0; i < snapshot.length; i++) {
            Node current = snapshot[i];
            boolean unwanted = current instanceof Remark || current instanceof ScriptTag;
            if (unwanted) {
                list.remove(current);
            }
            // Children are visited whether or not the node itself was removed.
            NodeList children = current.getChildren();
            if (children != null && children.size() > 0) {
                removeRemark(children);
            }
        }
    }
    return list;
}