Htmlparser 使用例子

目前根据项目的需要,做了一个类似于网页爬虫的工具,可以对互联网上的网站进行网页解析,分析网页的节点、图片等。

使用Htmlparser解析,使事情变得简单,HTMLParser具有小巧,快速的优点,缺点是相关文档比较少(英文的也少),很多功能需要自己摸索。对于初学者还是要费一些功夫的,而一旦上手以后,会发现HTMLParser的结构设计很巧妙,非常实用,基本你的各种需求都可以满足。

HTMLParser的主页是 http://htmlparser.sourceforge.net/ ,可以从该网站下载:

htmlparser.jar、htmllexer.jar、HTMLParser-2.0-SNAPSHOT-src.zip(源码)

这是我做的一个小例子,copy到你的IDE下就可以测试:

import java.net.URL;

import junit.framework.TestCase;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.beans.LinkBean;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.HeadTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.InputTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.OptionTag;
import org.htmlparser.tags.SelectTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import org.htmlparser.visitors.NodeVisitor;
import org.htmlparser.visitors.ObjectFindingVisitor;

/**
 * Sample JUnit 3 test case that exercises several HTMLParser APIs: visitors,
 * node filters, {@link HtmlPage} and {@link LinkBean}.
 *
 * <p>The methods print what they find to standard output instead of asserting
 * on it, and exceptions raised while parsing are printed rather than reported
 * as test failures. The URL-based samples fetch remote pages.
 */
public class ParserTestCase extends TestCase {
    private static final String TAOKE_URL = "http://pindao.huoban.taobao.com/tms/channel/channelcode.htm?pid=mm_17386592_0_0&eventid=101329";
    private static final String BAIDU_URL = "http://www.baidu.com";

    public ParserTestCase(String name) {
        super(name);
    }

    /**
     * Creates a parser for the page at {@code url} and re-applies the encoding
     * the parser reports for that page, as every URL-based sample here does.
     *
     * @param url address of the page to parse
     * @return a parser positioned on that page
     * @throws ParserException if the parser cannot be set up for the URL
     */
    private static Parser createUrlParser(String url) throws ParserException {
        Parser parser = new Parser();
        parser.setURL(url);
        parser.setEncoding(parser.getEncoding());
        return parser;
    }

    /**
     * Demonstrates {@link ObjectFindingVisitor}: collects every
     * {@link ImageTag} on a page and prints its URL, location and SRC attribute.
     */
    public void testImageVisitor() {
        try {
            ObjectFindingVisitor visitor = new ObjectFindingVisitor(
                    ImageTag.class);
            Parser parser = createUrlParser(BAIDU_URL);
            parser.visitAllNodesWith(visitor);
            Node[] nodes = visitor.getTags();
            for (int i = 0; i < nodes.length; i++) {
                ImageTag image = (ImageTag) nodes[i];
                System.out.println("testImageVisitor() ImageURL = "
                        + image.getImageURL());
                System.out.println("testImageVisitor() ImageLocation = "
                        + image.extractImageLocn());
                System.out.println("testImageVisitor() SRC = "
                        + image.getAttribute("SRC"));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Demonstrates {@link TagNameFilter}: extracts every {@code a} tag of a
     * page and prints its HTML and, when it has one, the HTML of its first
     * child.
     */
    public void testNodeFilter() {
        try {
            NodeFilter filter = new TagNameFilter("a");
            Parser parser = createUrlParser(BAIDU_URL);
            NodeList list = parser.extractAllNodesThatMatch(filter);
            for (int i = 0; i < list.size(); i++) {
                Node anchor = list.elementAt(i);
                System.out.println("testNodeFilter() " + anchor.toHtml());
                // An anchor without children has no first child; skip it
                // instead of letting a NullPointerException end the loop.
                Node firstChild = anchor.getFirstChild();
                if (firstChild != null) {
                    System.out.println("testNodeFilter-text: " + firstChild.toHtml());
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Demonstrates {@link NodeClassFilter}: extracts every {@link LinkTag} of
     * a page and prints its inner HTML and its link.
     */
    public void testLinkTag() {
        try {
            NodeFilter filter = new NodeClassFilter(LinkTag.class);
            Parser parser = createUrlParser(TAOKE_URL);
            NodeList list = parser.extractAllNodesThatMatch(filter);
            for (int i = 0; i < list.size(); i++) {
                LinkTag link = (LinkTag) list.elementAt(i);
                System.out.println("testLinkTag() getLinkText is :" + link.getChildrenHTML());

                System.out.println("testLinkTag() Link is :" + link.extractLink());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses an in-memory document whose head contains stylesheet
     * {@code <link>} tags and prints the text and class of each node returned
     * by {@code parser.elements()}.
     */
    public void testLinkCSS() {
        try {
            Parser parser = new Parser();
            // Attribute values use plain ASCII quotes so the quote characters
            // do not end up inside the parsed values.
            parser.setInputHTML("<head><title>Link Test</title>"
                    + "<link href='/test01/css.css' text='text/css' rel='stylesheet' />"
                    + "<link href='/test02/css.css' text='text/css' rel='stylesheet' />"
                    + "</head>" + "<body>");
            parser.setEncoding(parser.getEncoding());

            for (NodeIterator e = parser.elements(); e.hasMoreNodes();) {
                Node node = e.nextNode();
                System.out.println("testLinkCSS()" + node.getText()
                        + node.getClass());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Demonstrates {@link OrFilter}: selects both {@link InputTag} and
     * {@link SelectTag} nodes from an in-memory document and prints the input
     * values and the text of each select option.
     */
    public void testOrFilter() {
        NodeFilter inputFilter = new NodeClassFilter(InputTag.class);
        NodeFilter selectFilter = new NodeClassFilter(SelectTag.class);
        try {
            Parser parser = new Parser();
            parser.setInputHTML("<head><title>OrFilter Test</title>"
                    + "<link href='/test01/css.css' text='text/css' rel='stylesheet' />"
                    + "<link href='/test02/css.css' text='text/css' rel='stylesheet' />"
                    + "</head>"
                    + "<body>"
                    + "<input type='text' value='text1' name='text1'/>"
                    + "<input type='text' value='text2' name='text2'/>"
                    + "<select><option id='1'>1</option><option id='2'>2</option><option id='3'></option></select>"
                    + "<a href='http://www.yeeach.com'>yeeach.com</a>"
                    + "</body>");
            parser.setEncoding(parser.getEncoding());
            OrFilter lastFilter = new OrFilter();
            lastFilter.setPredicates(new NodeFilter[] { selectFilter,
                    inputFilter });
            NodeList nodeList = parser.parse(lastFilter);
            // Strictly less than size(): index size() is past the last match.
            for (int i = 0; i < nodeList.size(); i++) {
                Node node = nodeList.elementAt(i);
                if (node instanceof InputTag) {
                    InputTag input = (InputTag) node;
                    System.out.println("OrFilter tag name is :" + input.getTagName()
                            + " ,tag value is:" + input.getAttribute("value"));
                } else if (node instanceof SelectTag) {
                    printOptions((SelectTag) node);
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the option text of every {@link OptionTag} child of
     * {@code select}; children of any other type are skipped.
     */
    private static void printOptions(SelectTag select) {
        NodeList children = select.getChildren();
        if (children == null) {
            return;
        }
        for (int j = 0; j < children.size(); j++) {
            Node child = children.elementAt(j);
            if (child instanceof OptionTag) {
                System.out.println("OrFilter Option"
                        + ((OptionTag) child).getOptionText());
            }
        }
    }

    /**
     * Parses two in-memory {@code <table>} elements and prints the plain text
     * of every cell, row by row.
     */
    public void testTable() {
        Parser myParser = Parser.createParser("<body>" + "<table id='table1' >"
                + "<tr><td>1-11</td><td>1-12</td><td>1-13</td>"
                + "<tr><td>1-21</td><td>1-22</td><td>1-23</td>"
                + "<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>"
                + "<table id='table2' >"
                + "<tr><td>2-11</td><td>2-12</td><td>2-13</td>"
                + "<tr><td>2-21</td><td>2-22</td><td>2-23</td>"
                + "<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>"
                + "</body>", "GBK");
        NodeFilter tableFilter = new NodeClassFilter(TableTag.class);
        try {
            NodeList nodeList = myParser.parse(tableFilter);
            // Strictly less than size(): index size() is past the last match.
            for (int i = 0; i < nodeList.size(); i++) {
                // The class filter only lets TableTag nodes through.
                TableTag table = (TableTag) nodeList.elementAt(i);
                TableRow[] rows = table.getRows();
                for (int j = 0; j < rows.length; j++) {
                    TableColumn[] cells = rows[j].getColumns();
                    for (int k = 0; k < cells.length; k++) {
                        System.out.println("<td>" + cells[k].toPlainTextString());
                    }
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
    }

    /**
     * Demonstrates {@link NodeVisitor}: visits every tag of a page and prints
     * its name and class.
     */
    public void testVisitorAll() {
        try {
            Parser parser = createUrlParser(BAIDU_URL);
            NodeVisitor visitor = new NodeVisitor() {
                public void visitTag(Tag tag) {
                    System.out.println("testVisitorAll()  Tag name is :"
                            + tag.getTagName() + " \n Class is :"
                            + tag.getClass());
                }
            };
            parser.visitAllNodesWith(visitor);
        } catch (ParserException e) {
            e.printStackTrace();
        }
    }

    /**
     * Demonstrates a {@link NodeVisitor} that treats tag types differently:
     * every tag gets its name, class and text printed under a type-specific
     * label, and a {@link LinkTag} additionally gets its href attribute.
     */
    public void testTagVisitor() {
        try {
            Parser parser = new Parser(
                    "<head><title>dddd</title>"
                            + "<link href='/test01/css.css' text='text/css' rel='stylesheet' />"
                            + "<link href='/test02/css.css' text='text/css' rel='stylesheet' />"
                            + "</head>" + "<body>"
                            + "<a href='http://www.yeeach.com'>yeeach.com</a>"
                            + "</body>");
            NodeVisitor visitor = new NodeVisitor() {
                public void visitTag(Tag tag) {
                    String label;
                    if (tag instanceof HeadTag) {
                        label = "visitTag() HeadTag : ";
                    } else if (tag instanceof TitleTag) {
                        label = "visitTag() TitleTag : ";
                    } else if (tag instanceof LinkTag) {
                        label = "visitTag() LinkTag : ";
                    } else {
                        label = "visitTag() : ";
                    }
                    String message = label + "Tag name is :"
                            + tag.getTagName() + " \n Class is :"
                            + tag.getClass() + "\n Text is :"
                            + tag.getText();
                    if (tag instanceof LinkTag) {
                        message += " \n getAttribute is :"
                                + tag.getAttribute("href");
                    }
                    System.out.println(message);
                }
            };
            parser.visitAllNodesWith(visitor);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Demonstrates {@link HtmlPage}: prints the page title, the HTML of each
     * body node, and the cells of every table, one row per line.
     */
    public void testHtmlPage() {
        try {
            Parser parser = new Parser();
            String inputHTML = "<html>" + "<head>"
                    + "<title>Welcome to the HTMLParser website</title>"
                    + "</head><body>Welcome to HTMLParser"
                    + "<table id='table1' >"
                    + "<tr><td>1-11</td><td>1-12</td><td>1-13</td>"
                    + "<tr><td>1-21</td><td>1-22</td><td>1-23</td>"
                    + "<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>"
                    + "<table id='table2' >"
                    + "<tr><td>2-11</td><td>2-12</td><td>2-13</td>"
                    + "<tr><td>2-21</td><td>2-22</td><td>2-23</td>"
                    + "<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>"
                    + "</body></html>";

            parser.setInputHTML(inputHTML);
            HtmlPage htmlPage = new HtmlPage(parser);
            parser.visitAllNodesWith(htmlPage);
            System.out.println("Title:" + htmlPage.getTitle());

            NodeList body = htmlPage.getBody();
            for (NodeIterator iterator = body.elements(); iterator.hasMoreNodes();) {
                Node node = iterator.nextNode();
                System.out.println(node.toHtml());
            }

            TableTag[] tables = htmlPage.getTables();
            for (int i = 0; i < tables.length; i++) {
                TableRow[] rows = tables[i].getRows();
                for (int r = 0; r < rows.length; r++) {
                    TableColumn[] cols = rows[r].getColumns();
                    for (int c = 0; c < cols.length; c++) {
                        System.out.print(cols[c].toPlainTextString() + " ");
                    }
                    System.out.println();
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
    }

    /**
     * Demonstrates {@link LinkBean}: prints every link the bean reports for a
     * page.
     */
    public void testLinkBean() {
        LinkBean linkBean = new LinkBean();
        linkBean.setURL(BAIDU_URL);
        URL[] urls = linkBean.getLinks();

        for (int i = 0; i < urls.length; i++) {
            System.out.println("testLinkBean() -url is :" + urls[i]);
        }
    }

    /**
     * Selects every {@link Div} of an in-memory document and prints its tag
     * text, HTML and plain text.
     */
    public void testDivCSS() {
        try {
            Parser parser = new Parser();
            // NOTE(review): the stylesheet hrefs were garbled in the original
            // sample; they are rebuilt here as plain absolute URLs - confirm.
            parser.setInputHTML("<html><head><title>Link Test</title>"
                    + "<link href='http://www.yeeach.com/test01/css.css' text='text/css' rel='stylesheet' />"
                    + "<link href='http://www.yeeach.com/test02/css.css' text='text/css' rel='stylesheet' />"
                    + "</head><body>"
                    + "<div id=AA>dafafda</div>"
                    + "<div id=A2>CCC</div>"
                    + "</body></html>");
            NodeFilter divFilter = new NodeClassFilter(Div.class);
            NodeList nodeList = parser.parse(divFilter);

            for (int i = 0; i < nodeList.size(); i++) {
                Node node = nodeList.elementAt(i);
                System.out.println("my--->" + node.getText() + node.toHtml() + node.toPlainTextString());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the plain text of an anchor whose content is an image followed by
     * text.
     */
    public void testAincludeImg() {
        try {
            Parser parser = new Parser();
            parser.setInputHTML("<html><head><title>Link Test</title></head><body><a href=http://wpa.qq.com/msgrd?V=1&amp;Uin=410145132&amp;Site=华奥星空论坛&amp;Menu=yes target='_blank'>"
                    + "<img src='http://wpa.qq.com/pa?p=1:410145132:4'  border='0' alt='QQ' />410145132</a></body></html>");
            NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
            NodeList nodeList = parser.parse(linkFilter);
            for (int i = 0; i < nodeList.size(); i++) {
                Node node = nodeList.elementAt(i);
                System.out.println("my--->" + node.toPlainTextString());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds a custom filter that accepts nodes whose text starts with
     * {@code filterStr}.
     *
     * @param filterStr prefix the node text must start with
     * @return the prefix-matching filter
     */
    private NodeFilter createStartWithFilter(final String filterStr) {
        return new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith(filterStr);
            }
        };
    }

    /**
     * Builds a custom filter that accepts nodes whose text ends with
     * {@code filterStr}.
     *
     * @param filterStr suffix the node text must end with
     * @return the suffix-matching filter
     */
    private NodeFilter createEndWithFilter(final String filterStr) {
        return new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().endsWith(filterStr);
            }
        };
    }

}

----------------------------------------------------------------------------------------

随便,使用一下:bing 地图.呵呵

地图图片
  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
按DOM模型解析html文件的工具包 已下是源码列表: META-INF/MANIFEST.MF META-INF/maven/org.htmlparser/htmlparser/pom.properties META-INF/maven/org.htmlparser/htmlparser/pom.xml org.htmlparser.Parser.class org.htmlparser.PrototypicalNodeFactory.class org.htmlparser.beans.BeanyBaby.class org.htmlparser.beans.FilterBean.class org.htmlparser.beans.HTMLLinkBean.class org.htmlparser.beans.HTMLTextBean.class org.htmlparser.beans.LinkBean.class org.htmlparser.beans.StringBean.class org.htmlparser.filters.AndFilter.class org.htmlparser.filters.CssSelectorNodeFilter.class org.htmlparser.filters.HasAttributeFilter.class org.htmlparser.filters.HasChildFilter.class org.htmlparser.filters.HasParentFilter.class org.htmlparser.filters.HasSiblingFilter.class org.htmlparser.filters.IsEqualFilter.class org.htmlparser.filters.LinkRegexFilter.class org.htmlparser.filters.LinkStringFilter.class org.htmlparser.filters.NodeClassFilter.class org.htmlparser.filters.NotFilter.class org.htmlparser.filters.OrFilter.class org.htmlparser.filters.RegexFilter.class org.htmlparser.filters.StringFilter.class org.htmlparser.filters.TagNameFilter.class org.htmlparser.http.HttpHeader.class org.htmlparser.sax.Attributes.class org.htmlparser.sax.Feedback.class org.htmlparser.sax.Locator.class org.htmlparser.sax.XMLReader.class org.htmlparser.scanners.CompositeTagScanner.class org.htmlparser.scanners.JspScanner.class org.htmlparser.scanners.ScriptDecoder.class org.htmlparser.scanners.ScriptScanner.class org.htmlparser.scanners.StyleScanner.class org.htmlparser.tags.AppletTag.class org.htmlparser.tags.BaseHrefTag.class org.htmlparser.tags.BlockquoteTag.class org.htmlparser.tags.BodyTag.class org.htmlparser.tags.Bullet.class org.htmlparser.tags.BulletList.class org.htmlparser.tags.CompositeTag.class org.htmlparser.tags.DefinitionList.class org.htmlparser.tags.DefinitionListBullet.class org.htmlparser.tags.Div.class org.htmlparser.tags.DoctypeTag.class org.htmlparser.tags.FormTag.class org.htmlparser.tags.FrameSetTag.class 
org.htmlparser.tags.FrameTag.class org.htmlparser.tags.HeadTag.class org.htmlparser.tags.HeadingTag.class org.htmlparser.tags.Html.class org.htmlparser.tags.ImageTag.class org.htmlparser.tags.InputTag.class org.htmlparser.tags.JspTag.class org.htmlparser.tags.LabelTag.class org.htmlparser.tags.LinkTag.class org.htmlparser.tags.MetaTag.class org.htmlparser.tags.ObjectTag.class org.htmlparser.tags.OptionTag.class org.htmlparser.tags.ParagraphTag.class org.htmlparser.tags.ProcessingInstructionTag.class org.htmlparser.tags.ScriptTag.class org.htmlparser.tags.SelectTag.class org.htmlparser.tags.Span.class org.htmlparser.tags.StyleTag.class org.htmlparser.tags.TableColumn.class org.htmlparser.tags.TableHeader.class org.htmlparser.tags.TableRow.class org.htmlparser.tags.TableTag.class org.htmlparser.tags.TextareaTag.class org.htmlparser.tags.TitleTag.class org.htmlparser.util.CharacterReference.class org.htmlparser.util.CharacterReferenceEx.class org.htmlparser.util.DefaultParserFeedback.class org.htmlparser.util.FeedbackManager.class org.htmlparser.util.IteratorImpl.class org.htmlparser.util.NodeTreeWalker.class org.htmlparser.util.ParserFeedback.class org.htmlparser.util.ParserUtils.class org.htmlparser.util.Translate.class org.htmlparser.visitors.HtmlPage.class org.htmlparser.visitors.LinkFindingVisitor.class org.htmlparser.visitors.ObjectFindingVisitor.class org.htmlparser.visitors.StringFindingVisitor.class org.htmlparser.visitors.TagFindingVisitor.class org.htmlparser.visitors.TextExtractingVisitor.class org.htmlparser.visitors.UrlModifyingVisitor.class org/htmlparser/beans/images/Chain16.gif org/htmlparser/beans/images/Chain32.gif org/htmlparser/beans/images/Knot16.gif org/htmlparser/beans/images/Knot32.gif

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值