HTMLParser之Lexer的功能实现

需要的HTMLParser包到这里下载

http://sourceforge.net/project/showfiles.php?group_id=24399

HTMLParser把HTML解析成三种类型的Node节点:RemarkNode(HTML中的注释)、TagNode(标签节点)、TextNode(文本节点)

lexer包里面对HTML进行了4级封装:Stream、Source、Page 和 Lexer

The package is arranged in four levels, Stream, Source, Page and Lexer, in the order of lowest to highest. A Stream is raw bytes from the URLConnection or file. It has no intelligence. A Source is raw characters, hence it knows about the encoding scheme used and can be reset if a different encoding is detected after partially reading in the text. A Page provides characters from the source while maintaining the index of line numbers, and hence can be thought of as an array of strings corresponding to source file lines, but it doesn't actually store any text, relying on the buffering within the Source instead. The Lexer contains the actual lexeme parsing code. It reads characters from the page, keeping track of where it is with a Cursor, and creates the array of nodes using various state machines.

State Machines

       The Lexer has the following state machines:

  • in text - parseString()
  • in comment - parseRemark()
  • in tag - parseTag()
  • in JSP tag - parseJsp()

There is another state machine -- parseCDATA -- used by higher level code (script and style scanners), but this isn't actually used by the lexer. 更多内容请到http://htmlparser.sourceforge.net/javadoc/org/htmlparser/lexer/package-summary.html

超简短引入包

import  java.io. * ;
import  java.net. * ;
import  java.util.Vector;

import  org.htmlparser. * ;
import  org.htmlparser.util. * ;
import  org.htmlparser.http. * ;
import  org.htmlparser.lexer. * ;
具体全面包
import  java.io.Serializable;
import  java.net.HttpURLConnection;
import  java.net.URLConnection;

import  org.htmlparser.filters.TagNameFilter;
import  org.htmlparser.filters.NodeClassFilter;
import  org.htmlparser.http.ConnectionManager;
import  org.htmlparser.http.ConnectionMonitor;
import  org.htmlparser.http.HttpHeader;
import  org.htmlparser.lexer.Lexer;
import  org.htmlparser.lexer.Page;
import  org.htmlparser.util.DefaultParserFeedback;
import  org.htmlparser.util.IteratorImpl;
import  org.htmlparser.util.NodeIterator;
import  org.htmlparser.util.NodeList;
import  org.htmlparser.util.ParserException;
import  org.htmlparser.util.ParserFeedback;
import  org.htmlparser.visitors.NodeVisitor;

import  org.htmlparser.lexer.Page;
import  org.htmlparser.util.NodeList;
import  org.htmlparser.util.ParserException;
import  org.htmlparser.visitors.NodeVisitor;
Lexer类中最重要的一个方法nextNode
public   class  LexerExtratText 
{
     
/** The page that nextNode reads characters from. */
protected  Page mPage;
     
/** The cursor handed to the page on every read; its position marks where each node starts. */
protected  Cursor mCursor;
     
/**
 * Debugging aid checked at the top of nextNode. A value of -1 skips the check;
 * otherwise, once the cursor's row exceeds this value, it is advanced to that row + 1.
 * NOTE(review): presumably a hook for placing a breakpoint on a given line -- confirm.
 */
protected   static   int  mDebugLineTrigger  =   - 1 ;
     
     
public  Page getPage ()
     {
         
return  (mPage);
     }
     
public   void  setPage (Page page)
     {
         
if  ( null   ==  page)
             
throw   new  IllegalArgumentException ( " page cannot be null " );
         mPage 
=  page;
     }
     
    
public  Node nextNode( boolean  quotesmart) throws  ParserException   
    
// 这里一直有一个低级的错误,那就是没return值;
    {
        
// 记录起始位置
         int  start;
        
// 记录读取的字符
         char  ch;
        
// 读取的节点
        Node ret;
        
// 判断是否为Debug模式
         if (mDebugLineTrigger !=- 1 )
        {
              Page page 
=  getPage ();
                 
int  lineno  =  page.row (mCursor);
              
if  (mDebugLineTrigger  <  lineno)
                     mDebugLineTrigger 
=  lineno  +   1 ;    
        }
        
// 取得当前字符的位置
        start = mCursor.getPosition();
        
// 取得当前的字符
        ch = mPage.getCharacter(mCursor);
        
switch (ch)
        {
        
//  判断是否到页尾
         case  Page.EOF:
            ret
= null ;
            
break ;
        
// 判断是否为一个标签的开始符
         case   ' < ' :
            
// 读取下一个字符
            ch = mPage.getCharacter(mCursor);
            
// 判断是否到尾,若是则返回一个String Node
             if (Page.EOF == ch)
            {
                ret
= makeString(start,mCursor.getPosition());
            }
            
// 如果是jsp,则回退一个字符,进入parseJsp 的状态机
             else   if (ch == ' % ' )
            {
                mPage.ungetCharacter(mCursor);
                ret
= parseJsp(start);
            }
            
// 如果是xml,则进入parsePI 的状态机
             else   if (ch == ' ? ' )
            {
                mPage.ungetCharacter(mCursor);
                ret
= parsePI(start);
            }
            
// 如果是’/'或字母,则进入parseTag的状态机
             else   if (ch == ' / ' || ch == ' % ' || Character.isLetter(ch))
            {
                mPage.ungetCharacter(mCursor);
                ret
= parseTag(start);
            }
            
// 如果是"!"则进入注释标签
             else   if (ch == ' ! ' )
            {
                ch
= mPage.getCharacter(mCursor);
                
if (Page.EOF == ch)
                    ret
= makeString(start,mCursor.getPosition());
                
else
                {
                    
if (ch == ' > ' )
                        ret
= makeRemark(start,mCursor.getPosition());
                    
else
                    {
                        
// 进入Remark/Tag状态机,需要回退一个字符
                        mPage.ungetCharacter(mCursor);
                        
if (ch == ' - ' )
                            ret
= parseRemark(start,quotesmart);
                        
else
                        {
                            mPage.ungetCharacter(mCursor);
                            ret
= parseTag(start);
                        }
                    }
                }
            }
            
// 进入parserString状态机
             else
                ret
= parserString(start,quotesmart);
            
break ;
            
default :
                mPage.ungetCharacter(mCursor);
                ret
= parseString(start,quotesmart);
            
break ;
        }
        
return  ret;
    }
加入main运行
public   static   void  main(String[] args)
    {
       ConnectionManager manager
= null ;
       Lexer lexer;
       Node node;
       
try
       {
           manager
= Page.getConnectionManager();
           lexer
= new  Lexer(manager.openConnection( " http://astro.sina.com.cn/sagittarius.html " ));
           
while ( null != (node = lexer.nextNode( false )))
           {
               System.out.println(node.toString());
           }
       }
       
catch (ParserException e)
       {
           e.printStackTrace();
       }
    }
解析结果
Tag ( 0 [ 0 , 0 ], 6 [ 0 , 6 ]): HTML
Tag (
6 [ 0 , 6 ], 12 [ 0 , 12 ]): HEAD
Txt (
12 [ 0 , 12 ], 13 [ 1 , 0 ]): 
Tag (
13 [ 1 , 0 ], 20 [ 1 , 7 ]): TITLE
End (
20 [ 1 , 7 ], 28 [ 1 , 15 ]):  / TITLE
Txt (
28 [ 1 , 15 ], 29 [ 2 , 0 ]): 
Tag (
29 [ 2 , 0 ], 97 [ 2 , 68 ]): meta http - equiv = " Content-Type "  content = " text/html; ch...
Txt ( 97 [ 2 , 68 ], 98 [ 3 , 0 ]): 
Tag (
98 [ 3 , 0 ], 200 [ 3 , 102 ]): META http - equiv = " refresh "  content  = " 0;url='http://a...
Txt ( 200 [ 3 , 102 ], 201 [ 4 , 0 ]): 
End (
201 [ 4 , 0 ], 208 [ 4 , 7 ]):  / HEAD
Txt (
208 [ 4 , 7 ], 209 [ 5 , 0 ]): 
Tag (
209 [ 5 , 0 ], 215 [ 5 , 6 ]): body
Rem (
215 [ 5 , 6 ], 239 [ 5 , 30 ]):  SUDA_CODE_START 
Txt (
239 [ 5 , 30 ], 240 [ 6 , 0 ]): 
Tag (
240 [ 6 , 0 ], 348 [ 6 , 108 ]): div id = sudsclickstreamdiv style = ' position:absolute...
End ( 348 [ 6 , 108 ], 354 [ 6 , 114 ]):  / div
Txt (
354 [ 6 , 114 ], 355 [ 7 , 0 ]): 
Tag (
355 [ 7 , 0 ], 386 [ 7 , 31 ]): script type = " text/javascript "
Txt (
386 [ 7 , 31 ], 389 [ 8 , 2 ]):  //
Rem ( 389 [ 8 , 2 ], 5901 [ 31 , 5 ]):  var _S_JV_ = " webbug_meta_ref_mod_noiframe_async_:...
Txt ( 5901 [ 31 , 5 ], 5902 [ 32 , 0 ]): 
End (
5902 [ 32 , 0 ], 5911 [ 32 , 9 ]):  / script
Txt (
5911 [ 32 , 9 ], 5912 [ 33 , 0 ]): 
Tag (
5912 [ 33 , 0 ], 5943 [ 33 , 31 ]): script type = " text/javascript "
Txt (
5943 [ 33 , 31 ], 5946 [ 34 , 2 ]):  //
Rem ( 5946 [ 34 , 2 ], 5968 [ 36 , 5 ]):  _S_pSt( "" ); //
Txt ( 5968 [ 36 , 5 ], 5969 [ 37 , 0 ]): 
End (
5969 [ 37 , 0 ], 5978 [ 37 , 9 ]):  / script
Txt (
5978 [ 37 , 9 ], 5979 [ 38 , 0 ]): 
Rem (
5979 [ 38 , 0 ], 6001 [ 38 , 22 ]):  SUDA_CODE_END 
Txt (
6001 [ 38 , 22 ], 6005 [ 41 , 0 ]):   
Rem (
6005 [ 41 , 0 ], 6029 [ 41 , 24 ]):  Start  Wrating  
Txt (
6029 [ 41 , 24 ], 6031 [ 43 , 0 ]): 
Tag (
6031 [ 43 , 0 ], 6061 [ 43 , 30 ]): script language = " javascript "
Txt (
6061 [ 43 , 30 ], 9627 [ 44 , 3565 ]):  var wrUrl = " //sina.wrating.com/ " ;var wrDoma...
Tag (
9627 [ 44 , 3565 ], 9667 [ 44 , 3605 ]): img src = " '+V+' "  width = " 1 "  height = " 1 "   /
Txt (
9667 [ 44 , 3605 ], 10863 [ 44 , 4801 ]):  ' )}function vjGetAcc(){var B=document.loc...
Tag ( 10863 [ 44 , 4801 ], 11245 [ 45 , 0 ]): A){var D = E + F; if (document.cookie.substring(E...
End (
11245 [ 45 , 0 ], 11254 [ 45 , 9 ]):  / script
Txt (
11254 [ 45 , 9 ], 11256 [ 47 , 0 ]): 
Rem (
11256 [ 47 , 0 ], 11275 [ 47 , 19 ]):  End Wrating
Txt (
11275 [ 47 , 19 ], 11277 [ 49 , 0 ]): 
Rem (
11277 [ 49 , 0 ], 11327 [ 49 , 50 ]):  START Nielsen // NetRatings SiteCensus V5.2 
Txt ( 11327 [ 49 , 50 ], 11328 [ 50 , 0 ]): 
Rem (
11328 [ 50 , 0 ], 11371 [ 50 , 43 ]):  COPYRIGHT  2006  Nielsen // NetRatings 
Txt ( 11371 [ 50 , 43 ], 11372 [ 51 , 0 ]): 
Tag (
11372 [ 51 , 0 ], 11403 [ 51 , 31 ]): script type = " text/javascript "
Txt (
11403 [ 51 , 31 ], 11562 [ 59 , 0 ]):  var _rsCI = " cn-sina2006 " ;   var _rsCG = ...
End (
11562 [ 59 , 0 ], 11571 [ 59 , 9 ]):  / script
Txt (
11571 [ 59 , 9 ], 11572 [ 60 , 0 ]): 
Tag (
11572 [ 60 , 0 ], 11645 [ 60 , 73 ]): script type = " text/javascript "  src = " //secure-c...
End ( 11645 [ 60 , 73 ], 11654 [ 60 , 82 ]):  / script
Txt (
11654 [ 60 , 82 ], 11655 [ 61 , 0 ]): 
Tag (
11655 [ 61 , 0 ], 11665 [ 61 , 10 ]): noscript
Txt (
11665 [ 61 , 10 ], 11667 [ 62 , 1 ]): 
Tag (
11667 [ 62 , 1 ], 11749 [ 62 , 83 ]): img src = " //secure-cn.imrworldwide.com/cgi-bin...
Txt ( 11749 [ 62 , 83 ], 11750 [ 63 , 0 ]): 
End (
11750 [ 63 , 0 ], 11761 [ 63 , 11 ]):  / noscript
Txt (
11761 [ 63 , 11 ], 11762 [ 64 , 0 ]): 
Rem (
11762 [ 64 , 0 ], 11810 [ 64 , 48 ]):  END Nielsen // NetRatings SiteCensus V5.2 
Txt ( 11810 [ 64 , 48 ], 11811 [ 65 , 0 ]): 
End (
11811 [ 65 , 0 ], 11818 [ 65 , 7 ]):  / body
End (
11818 [ 65 , 7 ], 11825 [ 65 , 14 ]):  / html

用这个代码肯定还不行,下一篇给出它的全代码

OK

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值