package function.htmlparser;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasParentFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
 * Small demo of the org.htmlparser API.
 *
 * <p>{@link #listAll(Parser)} dumps every node yielded by {@code parser.elements()};
 * {@link #filter(Parser)} prints the links found as the first child of nodes whose
 * parent carries {@code class="post-title"}; {@link #main(String[])} runs the filter
 * against a hard-coded local URL.
 */
public class Test {

    /**
     * Prints the text and the HTML of each node yielded by {@code parser.elements()}.
     * A {@link ParserException} is reported on stderr and ends the dump.
     *
     * @param parser parser to iterate over
     */
    public void listAll(Parser parser) {
        try {
            NodeIterator nodeIterator = parser.elements();
            while (nodeIterator.hasMoreNodes()) {
                System.out.println("+++++++++++++++++++++");
                Node node = nodeIterator.nextNode();
                System.out.println("getText():" + node.getText());
                System.out.println("getHtml():" + node.toHtml());
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the {@code href} and inner text of every link that is the first child of a
     * node whose parent has {@code class="post-title"}.
     *
     * <p>Matched nodes whose first child is missing or is not a {@link LinkTag} are
     * skipped instead of aborting the whole run with a cast or null-pointer failure.
     * A {@link ParserException} is reported on stderr.
     *
     * @param parser parser to run the filter against
     */
    public void filter(Parser parser) {
        NodeFilter filterS = new HasAttributeFilter("class", "post-title");
        NodeFilter filterP = new HasParentFilter(filterS);
        try {
            NodeList nodelist = parser.parse(filterP);
            // Fetch the matching nodes.
            // NOTE(review): parse(filterP) presumably already collects matches from the
            // whole tree, which would make this second recursive pass redundant - confirm
            // against the htmlparser Javadoc before removing it.
            nodelist = nodelist.extractAllNodesThatMatch(filterP, true);
            for (int i = 0; i < nodelist.size(); i++) {
                Node firstChild = nodelist.elementAt(i).getFirstChild();
                // The first child may be absent or a text node (e.g. "<td>text <a ...>");
                // only link tags can be printed below.
                if (!(firstChild instanceof LinkTag)) {
                    continue;
                }
                LinkTag link = (LinkTag) firstChild;
                // "\n" (not the literal "/n") leaves a blank line after the href.
                System.out.println(link.getAttribute("href") + "\n");
                System.out.println(link.getStringText());
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
    }

    /**
     * Points a parser at the local sample page, forces the {@code gb2312} encoding and
     * runs {@link #filter(Parser)} on it.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String urlStr = "http://localhost:8080/tomfish88/error.jsp";
        Parser parser = new Parser();
        try {
            parser.setURL(urlStr);
            parser.setEncoding("gb2312");
        } catch (ParserException e) {
            e.printStackTrace();
            // Do not run the filter against a parser that failed its setup.
            return;
        }
        Test test = new Test();
        test.filter(parser);
    }
}
html文件
<%@ page language="java" contentType="text/html; charset=GB18030"
pageEncoding="GB18030"%>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=GB18030">
<title>Insert title here</title>
</head>
<body>
<!--
  Sample page with a four-row table (presumably the page that Test.main fetches
  as error.jsp; confirm against the deployment):
    row 1: plain cell, no link, no class
    row 2: class="post-title", cell whose first child is a link
    row 3: no class, cell with leading text followed by a link
    row 4: class="post-title", cell whose first child is a link
-->
error!!!!!!
<table>
<tr><td>td-c1</td></tr>
<tr class="post-title"><td><a href="http://www.fsd.com">连接1</a></td></tr>
<tr><td>td-cc1 <a href="http://www.fsd44444.com">连接3</a> </td></tr>
<tr class="post-title"><td><a href="http://www.fsd222222.com">连接2</a></td></tr>
</table>
</body>
</html>
java文件