使用 htmlparser 提取网页纯文本内容的实现,代码如下:
/**
 * Extracts the plain-text content of an HTML document using htmlparser.
 *
 * <p>Only the top-level nodes of the parsed document are requested. Each
 * node's {@code toPlainTextString()} is then appended in document order.
 * Collecting every nested node as well (as an accept-all filter passed to
 * {@code extractAllNodesThatMatch} does) would append the same text once
 * per enclosing tag, because a composite tag's plain text already contains
 * the text of its descendants.
 *
 * @param inputHtml the HTML markup to convert; may be {@code null} or empty
 * @return the concatenated plain text of the document, or an empty string
 *         when the input is {@code null}/empty or the markup cannot be parsed
 */
public static String getPlainTextFromHTML(String inputHtml){
    // Nothing to parse: avoid handing a null string to the parser factory.
    if (inputHtml == null || inputHtml.length() == 0) {
        return "";
    }

    Parser parser = Parser.createParser(inputHtml, "UTF-8");

    NodeList nodes;
    try {
        // A null filter asks the parser for the top-level nodes only, so
        // nested text is reached exactly once through its ancestors.
        nodes = parser.parse(null);
    } catch (ParserException e) {
        // Best effort: report the failure and return no text instead of
        // falling through to a NullPointerException on an unset node list.
        e.printStackTrace();
        return "";
    }

    StringBuilder text = new StringBuilder();
    for (int i = 0; i < nodes.size(); i++) {
        text.append(nodes.elementAt(i).toPlainTextString());
    }
    return text.toString();
}