HTMLParser的功能及实现之Filter模式

使用Filter模式时,需要引入 import org.htmlparser.filters.*; 以及 import org.htmlparser.NodeFilter;,所有过滤器都实现了 NodeFilter 接口。

NodeFilter接口的源码见: https://svn.sourceforge.net/svnroot/htmlparser/trunk/lexer/src/main/java/org/htmlparser/NodeFilter.java 。下面的测试类从sina主页上提取所有含有图片的链接。

package  my.htmlparser;

import  java.io. * ;
// import org.htmlparser.*;      // Parser
import  org.htmlparser.Parser;
import  org.htmlparser.NodeFilter;
import  org.htmlparser.Node;
import  org.htmlparser.http. * ; // ConnectionManager需要引入的类
import  org.htmlparser.util.NodeList;
import  org.htmlparser.util.NodeIterator;
import  org.htmlparser.lexer.Page;
import  org.htmlparser.filters. * ;

public   class  FilterTest
{
    
public   static   void  main(String[] args) 
    {
        ConnectionManager manager;
        
try
        {
            manager
= Page.getConnectionManager();
            Parser parser
= new  Parser(manager.openConnection( " http://www.sina.com.cn " ));
            
// Parser parser=new Parser(" http://www.sina.com.cn ");
            parser.setEncoding( " GB2312 " );
            
// 过滤得到“a"标签并且含有子标签”img“
            NodeFilter filter = new  AndFilter( new  TagNameFilter( " a " ),
                    
new  HasChildFilter( new  TagNameFilter( " img " )));
            NodeList nodeList
= parser.parse(filter);
            
// 迭代逐渐查找
            NodeIterator it = nodeList.elements();
            
while (it.hasMoreNodes())
            {
                Node node
= (Node)it.nextNode();
                System.out.println(node.toHtml());
            }
        }
        
catch (Exception e)
        {
            e.printStackTrace();
        }
    }

}
程序运行结果如下:
< a href = " http://cl.sina.com.cn/ " >< img src = " http://i1.sinaimg.cn/home/deco/2008/0329/sinahome_ws_037.gif "  width = " 27 "  height = " 15 "  alt = ""  style = " position:absolute;top:-2px;left:6px; "   /></ a >
< A HREF = " http://44.adsina.allyes.com/main/adfclick?user=AFP6_for_SINA|Home|HomePV&db=sina " >< IMG SRC = " http://44.adsina.allyes.com/main/adfshow?user=AFP6_for_SINA|Home|HomePV&db=sina "  WIDTH = 1  HEIGHT = 1  BORDER = 0 ></ a >
< a href = " http://sina.allyes.com/main/adfclick?db=sina&bid=66825,97935,98029&cid=0,0,0&sid=90934&advid=1832&camid=13706&show=ignore&url=http://61.49.38.5/tzhddoc/index.htm "  target = " _blank " >< img src = " http://i1.sinaimg.cn/home/deco/2008/0325/tmp001.gif "  width = " 120 "  height = " 20 "  alt = " 投资海淀 " /></ a >
< a href = " javascript:submitFormWithChannel('hplogo') " >< img src = " http://i1.sinaimg.cn/home/07index/home_google_logo_070702.gif "  width = " 94 "  height = " 23 "  alt = " Google " /></ a >
< a href = " http://sina.allyes.com/main/adfclick?db=sina&bid=113516,147454,152396&cid=0,0,0&sid=139544&advid=1039&camid=18960&show=ignore&url=http://my2008.sina.com.cn "  target = " _blank " >< img src = " http://i0.sinaimg.cn/dy/deco/2007/1119/my2008_title.jpg "  width = " 120 "  height = " 20 "  border = " 0 " ></ a >
< a href = " http://sina.allyes.com/main/adfclick?db=sina&bid=113516,147455,152397&cid=0,0,0&sid=139545&advid=1039&camid=18960&show=ignore&url=http://gongyi.sina.com.cn/wenming2008/index.shtml "  target = " _blank " >< img src = " http://i0.sinaimg.cn/blog/z/henan/U1570P346T50D130F701DT20080428141602.gif "  width = " 150 "  height = " 20 "  border = " 0 " ></ a >
< a href = " http://sina.allyes.com/main/adfclick?db=sina&bid=3768,26807,26851&cid=0,0,0&sid=27056&advid=939&camid=3974&show=ignore&url=http://mall.sina.com.cn/SinaHtml/html/080401jx/index.html "  target = " _blank " >< img src = " http://i1.sinaimg.cn/home/2008-03-26/5/U210P30T5D5F746DT20080512095805.jpg "  width = " 135 "  height = " 51 "  alt = " 新浪商城 "   /></ a >
< a href = " http://gongyi.sina.com.cn/z/2008xzfx/index.html "  target = " _blank " >< img src = " http://i0.sinaimg.cn/dy/temp/450/2008/0326/U2769P1T450D19F10124DT20080508114248.jpg "  width = " 120 "  height = " 120 "  alt = " 图片 " ></ a >
< a href = " http://www.sina.net/ "  target = " _blank " >< img src = " http://i1.sinaimg.cn/home/deco/2008/0329/sinahome_0803_jingjia_002.gif "  width = " 29 "  height = " 15 "  alt = " 企业 "   /></ a >
< a href = " http://mag.sina.net/ "  target = " _blank "  style = " width:60px; " >< img src = " http://i0.sinaimg.cn/home/deco/2008/0329/sinahome_0803_jingjia_009.gif "  width = " 60 "  height = " 15 "  alt = " 电子杂志 "   /></ a >
< a href = " http://suqian.classad.sina.com.cn "  target = " _blank " >< img src = " http://d1.sina.com.cn/litong/2008qita/shouye-fenleirukou/08-4-24/sq.gif "  width = " 30 "  height = " 15 "  alt = " 宿迁 "   /></ a >
< a href = " http://bj.classad.sina.com.cn "  target = " _blank " >< img src = " http://d1.sina.com.cn/litong/2008qita/shouye-fenleirukou/08-4-24/bj.gif "  width = " 30 "  height = " 15 "  alt = " 北京 "   /></ a >
< a href = " http://sxxa.classad.sina.com.cn "  target = " _blank " >< img src = " http://d1.sina.com.cn/litong/2008qita/shouye-fenleirukou/08-4-24/xa.gif "  width = " 30 "  height = " 15 "  alt = " 西安 "   /></ a >
< a href = " http://scnch.classadnew.sina.com.cn "  target = " _blank " >< img src = " http://d1.sina.com.cn/litong/qiye2008/08-5-8/nc.jpg "  width = " 30 "  height = " 15 "  alt = " 南充 "   /></ a >
< a href = " http://hbwh.classad.sina.com.cn "  target = " _blank " >< img src = " http://d1.sina.com.cn/litong/2008qita/shouye-fenleirukou/08-4-24/wh.gif "  width = " 30 "  height = " 15 "  alt = " 武汉 "   /></ a >
< a href = " http://www.hd315.gov.cn/beian/view.asp?bianhao=0102000102300001 "  target = " _blank " >< img src = " http://i3.sinaimg.cn/home/07index/sinahome_wscfy_031.gif "  width = " 118 "  height = " 48 "  alt = " 经营性网站备案信息 "   /></ a >
< a href = " http://www.allyes.com "  target = " _blank " >< img src = " http://i2.sinaimg.cn/home/07index/sinahome_ws_032.gif "  width = " 118 "  height = " 48 "  alt = " 好耶广告网络 " /></ a >
< a href = " http://net.china.cn/chinese/index.htm "  target = " _blank " >< img src = " http://i3.sinaimg.cn/home/07index/sinahome_ws_033.gif "  width = " 118 "  height = " 48 "  alt = " 不良信息举报中心 "   /></ a >
< a href = " http://netbj.org.cn/index.asp "  target = " _blank " >< img src = " http://i0.sinaimg.cn/home/07index/sinahome_ws_034.gif "  width = " 118 "  height = " 48 "  alt = " 北京网络行业协会 "   /></ a >
< a href = " http://www.bj.cyberpolice.cn/index.htm "  target = " _blank " >< img src = " http://i1.sinaimg.cn/home/07index/sinahome_ws_035.gif "  width = " 118 "  height = " 48 "  alt = " 网络110报警服务 "   /></ a >
< a href = " http://www.ctws.com.cn "  target = " _blank " >< img src = " http://i2.sinaimg.cn/home/07index/sinahome_ws_036.gif "  width = " 118 "  height = " 48 "  alt = " 无线互联网业自律同盟 "   /></ a >
< a href = " +adurl100100+ "  target = ' _blank ' >< img src = " +adflash100100+ "  width = ' 100 '  height = ' 100 '  border = ' 0 '   /></ a >
如果不显式使用ConnectionManager,而是直接用URL构造解析器(即 Parser parser = new Parser("http://www.sina.com.cn");),运行结果如下:
< a href = " http://cl.sina.com.cn/ " >< img src = " http://i1.sinaimg.cn/home/deco/2008/0329/sinahome_ws_037.gif "  width = " 27 "  height = " 15 "  alt = ""  style = " position:absolute;top:-2px;left:6px; "   /></ a >
< A HREF = " http://44.adsina.allyes.com/main/adfclick?user=AFP6_for_SINA|Home|HomePV&db=sina " >< IMG SRC = " http://44.adsina.allyes.com/main/adfshow?user=AFP6_for_SINA|Home|HomePV&db=sina "  WIDTH = 1  HEIGHT = 1  BORDER = 0 ></ a >
< a href = " http://sina.allyes.com/main/adfclick?db=sina&bid=66825,97935,98029&cid=0,0,0&sid=90934&advid=1832&camid=13706&show=ignore&url=http://61.49.38.5/tzhddoc/index.htm "  target = " _blank " >< img src = " http://i1.sinaimg.cn/home/deco/2008/0325/tmp001.gif "  width = " 120 "  height = " 20 "  alt = " 投资海淀 " /></ a >
< a href = " javascript:submitFormWithChannel('hplogo') " >< img src = " http://i1.sinaimg.cn/home/07index/home_google_logo_070702.gif "  width = " 94 "  height = " 23 "  alt = " Google " /></ a >
< a href = " http://sina.allyes.com/main/adfclick?db=sina&bid=113516,147454,152396&cid=0,0,0&sid=139544&advid=1039&camid=18960&show=ignore&url=http://my2008.sina.com.cn "  target = " _blank " >< img src = " http://i0.sinaimg.cn/dy/deco/2007/1119/my2008_title.jpg "  width = " 120 "  height = " 20 "  border = " 0 " ></ a >
< a href = " http://sina.allyes.com/main/adfclick?db=sina&bid=113516,147455,152397&cid=0,0,0&sid=139545&advid=1039&camid=18960&show=ignore&url=http://gongyi.sina.com.cn/wenming2008/index.shtml "  target = " _blank " >< img src = " http://i0.sinaimg.cn/blog/z/henan/U1570P346T50D130F701DT20080428141602.gif "  width = " 150 "  height = " 20 "  border = " 0 " ></ a >
< a href = " http://sina.allyes.com/main/adfclick?db=sina&bid=3768,26807,26851&cid=0,0,0&sid=27056&advid=939&camid=3974&show=ignore&url=http://mall.sina.com.cn/SinaHtml/html/080401jx/index.html "  target = " _blank " >< img src = " http://i1.sinaimg.cn/home/2008-03-26/5/U210P30T5D5F746DT20080512095805.jpg "  width = " 135 "  height = " 51 "  alt = " 新浪商城 "   /></ a >
< a href = " http://gongyi.sina.com.cn/z/2008xzfx/index.html "  target = " _blank " >< img src = " http://i0.sinaimg.cn/dy/temp/450/2008/0326/U2769P1T450D19F10124DT20080508114248.jpg "  width = " 120 "  height = " 120 "  alt = " 图片 " ></ a >
< a href = " http://www.sina.net/ "  target = " _blank " >< img src = " http://i1.sinaimg.cn/home/deco/2008/0329/sinahome_0803_jingjia_002.gif "  width = " 29 "  height = " 15 "  alt = " 企业 "   /></ a >
< a href = " http://mag.sina.net/ "  target = " _blank "  style = " width:60px; " >< img src = " http://i0.sinaimg.cn/home/deco/2008/0329/sinahome_0803_jingjia_009.gif "  width = " 60 "  height = " 15 "  alt = " 电子杂志 "   /></ a >
< a href = " http://suqian.classad.sina.com.cn "  target = " _blank " >< img src = " http://d1.sina.com.cn/litong/2008qita/shouye-fenleirukou/08-4-24/sq.gif "  width = " 30 "  height = " 15 "  alt = " 宿迁 "   /></ a >
< a href = " http://bj.classad.sina.com.cn "  target = " _blank " >< img src = " http://d1.sina.com.cn/litong/2008qita/shouye-fenleirukou/08-4-24/bj.gif "  width = " 30 "  height = " 15 "  alt = " 北京 "   /></ a >
< a href = " http://sxxa.classad.sina.com.cn "  target = " _blank " >< img src = " http://d1.sina.com.cn/litong/2008qita/shouye-fenleirukou/08-4-24/xa.gif "  width = " 30 "  height = " 15 "  alt = " 西安 "   /></ a >
< a href = " http://scnch.classadnew.sina.com.cn "  target = " _blank " >< img src = " http://d1.sina.com.cn/litong/qiye2008/08-5-8/nc.jpg "  width = " 30 "  height = " 15 "  alt = " 南充 "   /></ a >
< a href = " http://hbwh.classad.sina.com.cn "  target = " _blank " >< img src = " http://d1.sina.com.cn/litong/2008qita/shouye-fenleirukou/08-4-24/wh.gif "  width = " 30 "  height = " 15 "  alt = " 武汉 "   /></ a >
< a href = " http://www.hd315.gov.cn/beian/view.asp?bianhao=0102000102300001 "  target = " _blank " >< img src = " http://i3.sinaimg.cn/home/07index/sinahome_wscfy_031.gif "  width = " 118 "  height = " 48 "  alt = " 经营性网站备案信息 "   /></ a >
< a href = " http://www.allyes.com "  target = " _blank " >< img src = " http://i2.sinaimg.cn/home/07index/sinahome_ws_032.gif "  width = " 118 "  height = " 48 "  alt = " 好耶广告网络 " /></ a >
< a href = " http://net.china.cn/chinese/index.htm "  target = " _blank " >< img src = " http://i3.sinaimg.cn/home/07index/sinahome_ws_033.gif "  width = " 118 "  height = " 48 "  alt = " 不良信息举报中心 "   /></ a >
< a href = " http://netbj.org.cn/index.asp "  target = " _blank " >< img src = " http://i0.sinaimg.cn/home/07index/sinahome_ws_034.gif "  width = " 118 "  height = " 48 "  alt = " 北京网络行业协会 "   /></ a >
< a href = " http://www.bj.cyberpolice.cn/index.htm "  target = " _blank " >< img src = " http://i1.sinaimg.cn/home/07index/sinahome_ws_035.gif "  width = " 118 "  height = " 48 "  alt = " 网络110报警服务 "   /></ a >
< a href = " http://www.ctws.com.cn "  target = " _blank " >< img src = " http://i2.sinaimg.cn/home/07index/sinahome_ws_036.gif "  width = " 118 "  height = " 48 "  alt = " 无线互联网业自律同盟 "   /></ a >
< a href = " +adurl100100+ "  target = ' _blank ' >< img src = " +adflash100100+ "  width = ' 100 '  height = ' 100 '  border = ' 0 '   /></ a >

对比可见,两种方式的输出完全相同。这是因为 Parser(String) 构造函数内部同样是通过 ConnectionManager 打开连接的,所以两种写法是等价的。

 

 

//  $Author: sunnydream $
//  $Date: 2008-05-10 10:44:17 $
//  $Revision: 4 $
package  org.htmlparser;

import  java.io.Serializable;

/**
 * Implement this interface to select particular nodes.
 
*/
public   interface  NodeFilter
    
extends
        Serializable,
        Cloneable
{

    
boolean  accept (Node node);
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值