使用了HTML Parser的一个小程序。import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.Writer; import java.util.ArrayList; import java.util.List; import org.apache.commons.collections.ListUtils; import org.apache.commons.lang.StringUtils; import org.htmlparser.Node; import org.htmlparser.Parser; import org.htmlparser.Remark; import org.htmlparser.tags.ScriptTag; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; /** * * @version 1.0 * @author 郝春利 */ public class HtmlContentsFileConverter extends AbstractConverter { private String inputEncode; private String outputEncode; public HtmlContentsFileConverter(String inputEncode, String outputEncode) { this.inputEncode = inputEncode; this.outputEncode = outputEncode; } public boolean convert(File r, File w) throws IOException { Parser parser = null; Writer to = null; String title = StringUtils.EMPTY; StringBuilder body = new StringBuilder(1024); try { parser = new Parser(r.getPath()); parser.setEncoding(inputEncode); NodeList rootNode = parser.parse(null); Node titleNode = getTitleNode(rootNode); if(titleNode != null){ title = BatchUtil.trimAndEscape(titleNode.toPlainTextString()) + "/n"; } Node bodyNode = null; for (Node node : getTargetNodeListByName(rootNode, "body")) { bodyNode = node; break; } if (bodyNode == null) { body = new StringBuilder(); } for (Node node : removeRemark(bodyNode.getChildren()).toNodeArray()) { body.append(node.toPlainTextString()); } } catch (ParserException e) { throw new RuntimeException(e); } try { to = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(w, false), outputEncode)); to.write(title + BatchUtil.trimAndEscape(body.toString())); } finally { try { if (to != null) { to.close(); } } catch (IOException e) { } } return true; } public Node getTitleNode(NodeList list) { if (list == null) { return null; } for (Node node : list.toNodeArray()) { if (node.toString().toLowerCase().startsWith("title")) { return node; } if (node.getChildren() != null && node.getChildren().toNodeArray() != null && 0 < node.getChildren().toNodeArray().length) { return getTitleNode(node.getChildren()); } } return null; } public List<Node> getTargetNodeListByName(NodeList list, String name) { if (list == null || StringUtils.isBlank(name)) { return ListUtils.EMPTY_LIST; } List<Node> resultList = new ArrayList<Node>(); for (Node node : list.toNodeArray()) { if (node.getText().toLowerCase().startsWith(name.toLowerCase())) { resultList.add(node); } if (node.getChildren() != null && node.getChildren().toNodeArray() != null && 0 < node.getChildren().toNodeArray().length) { resultList.addAll(getTargetNodeListByName(node.getChildren(), name)); } } return resultList; } public NodeList removeRemark(NodeList list) { if (list == null) { return list; } for (Node node : list.toNodeArray()) { if (node instanceof Remark || node instanceof ScriptTag) { list.remove(node); } if (node.getChildren() != null && node.getChildren().toNodeArray() != null && 0 < node.getChildren().toNodeArray().length) { removeRemark(node.getChildren()); } } return list; }