- package function.htmlparser;
- import org.htmlparser.Node;
- import org.htmlparser.NodeFilter;
- import org.htmlparser.Parser;
- import org.htmlparser.filters.AndFilter;
- import org.htmlparser.filters.HasAttributeFilter;
- import org.htmlparser.filters.HasParentFilter;
- import org.htmlparser.filters.TagNameFilter;
- import org.htmlparser.tags.LinkTag;
- import org.htmlparser.util.NodeIterator;
- import org.htmlparser.util.NodeList;
- import org.htmlparser.util.ParserException;
- public class Test {
- public void listAll(Parser parser){
- try {
- NodeIterator nodeIterator=parser.elements();
- while (nodeIterator.hasMoreNodes()){
- System.out.println("+++++++++++++++++++++");
- Node node=nodeIterator.nextNode();
- System.out.println("getText():"+node.getText());
- System.out.println("getHtml():"+node.toHtml());
- }
- } catch (ParserException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- }
- public void filter(Parser parser){
- NodeList nodelist;
- // NodeFilter filterL = new TagNameFilter("a");
- NodeFilter filterS = new HasAttributeFilter("class","post-title");
- NodeFilter filterP= new HasParentFilter(filterS);
- try {
- nodelist=parser.parse(filterP);
- //Node node=nodelist.elementAft(0);
- // NodeFilter haf= new HasAttributeFilter("class","post-title");
- // 获取相应的节点
- nodelist=nodelist.extractAllNodesThatMatch(filterP,true);
- for(int i=0;i<nodelist.size();i++){
- LinkTag link=(LinkTag)nodelist.elementAt(i).getFirstChild();
- System.out.println(link.getAttribute("href")+"/n");
- System.out.println(link.getStringText());
- // System.out.println(nodelist.elementAt(i).getFirstChild().getText()+"-----"+nodelist.elementAt(i).getFirstChild().toHtml());
- }
- } catch (ParserException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- }
- public static void main(String[] args) {
- String urlStr="http://localhost:8080/tomfish88/error.jsp";
- Parser parser=new Parser();
- try {
- parser.setURL(urlStr);
- parser.setEncoding("gb2312");
- } catch (ParserException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- Test test=new Test();
- test.filter(parser);
- }
- }
html文件
<%--
  Sample page: a table in which the rows marked class="post-title" each hold a
  single link directly inside their <td>; the remaining rows do not carry that class.
  The page declares GB18030 as both its page encoding and its content charset.
  NOTE(review): presumably this is the error.jsp page that Test.main parses - confirm.
  NOTE(review): Test.main sets the parser encoding to gb2312 while this page
  declares GB18030 - confirm that the mismatch is intended.
--%>
- <%@ page language="java" contentType="text/html; charset=GB18030"
- pageEncoding="GB18030"%>
- <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
- <html>
- <head>
- <meta http-equiv="Content-Type" content="text/html; charset=GB18030">
- <title>Insert title here</title>
- </head>
- <body>
- error!!!!!!
- <table>
- <tr><td>td-c1</td></tr>
- <tr class="post-title"><td><a href="http://www.fsd.com">连接1</a></td></tr>
- <tr><td>td-cc1 <a href="http://www.fsd44444.com">连接3</a> </td></tr>
- <tr class="post-title"><td><a href="http://www.fsd222222.com">连接2</a></td></tr>
- </table>
- </body>
- </html>
java文件