需要的HTMLParser包到这里下载
HTML有三种类型的Node的节点,RemarkNode(Html中的注释)、TagNode(标签节点)、TextNode(文本节点)
lexer包里面对HTML进行了4级封装:Stream、Source、Page 和 Lexer。 The package is arranged in four levels, Stream, Source, Page and Lexer, in the order of lowest to highest. A Stream is raw bytes from the URLConnection or file. It has no intelligence. A Source is raw characters, hence it knows about the encoding scheme used and can be reset if a different encoding is detected after partially reading in the text. A Page provides characters from the source while maintaining the index of line numbers, and hence can be thought of as an array of strings corresponding to source file lines, but it doesn't actually store any text, relying on the buffering within the Source instead. The Lexer contains the actual lexeme parsing code. It reads characters from the page, keeping track of where it is with a Cursor and creates the array of nodes using various state machines.
State Machines
       The Lexer has the following state machines:
  • in text - parseString()
  • in comment - parseRemark()
  • in tag - parseTag()
  • in JSP tag - parseJsp()
There is another state machine -- parseCDATA -- used by higher level code (script and style scanners), but this isn't actually used by the lexer. 更多内容请参见 http://htmlparser.sourceforge.net/javadoc/org/htmlparser/lexer/package-summary.html
超简短引入包


import java.io.*;
import java.net.*;
import java.util.Vector;

import org.htmlparser.*;
import org.htmlparser.util.*;
import org.htmlparser.http.*;
import org.htmlparser.lexer.*;

具体完整的引入包

import java.io.Serializable;
import java.net.HttpURLConnection;
import java.net.URLConnection;

import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.http.ConnectionManager;
import org.htmlparser.http.ConnectionMonitor;
import org.htmlparser.http.HttpHeader;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.DefaultParserFeedback;
import org.htmlparser.util.IteratorImpl;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.ParserFeedback;
import org.htmlparser.visitors.NodeVisitor;

import org.htmlparser.lexer.Page;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.NodeVisitor;

Lexer类中最重要的一个方法nextNode
/**
 * Excerpt of a lexer that reads characters from a {@link Page} through a
 * {@link Cursor} and turns them into nodes; see {@code nextNode}.
 */
public class LexerExtratText
{
         /** Page the lexer reads characters from; exposed through getPage/setPage. */
         protected Page mPage;
         /** Current read position; handed to the page whenever a character is read or pushed back. */
         protected Cursor mCursor;
         /** Line number compared against the cursor's row in nextNode; -1 disables that check. */
         protected static int mDebugLineTrigger = -1;
        
         public Page getPage ()
         {
                 return (mPage);
         }
         public void setPage (Page page)
         {
                 if ( null == page)
                         throw new IllegalArgumentException ( "page cannot be null");
                 mPage = page;
         }
        
         public Node nextNode( boolean quotesmart) throws ParserException    
         //这里一直有一个低级的错误,那就是没return值;
        {
                 //记录起始位置
                 int start;
                 //记录读取的字符
                 char ch;
                 //读取的节点
                Node ret;
                 //判断是否为Debug模式
                 if(mDebugLineTrigger!=-1)
                {
                            Page page = getPage ();
                                 int lineno = page.row (mCursor);
                             if (mDebugLineTrigger < lineno)
                                         mDebugLineTrigger = lineno + 0">1;        
                }
                 //取得当前字符的位置
                start=mCursor.getPosition();
                 //取得当前的字符
                ch=mPage.getCharacter(mCursor);
                 switch(ch)
                {
                 // 判断是否到页尾
                 case Page.EOF:
                        ret= null;
                         break;
                 //判断是否为一个标签的开始符
                 case ''<'':
                         //读取下一个字符
                        ch=mPage.getCharacter(mCursor);
                         //判断是否到尾,若是则返回一个String Node
                         if(Page.EOF==ch)
                        {
                                ret=makeString(start,mCursor.getPosition());
                        }
                         //如果是jsp,则回退一个字符,进入parseJsp 的状态机
                         else if(ch==''%'')
                        {
                                mPage.ungetCharacter(mCursor);
                                ret=parseJsp(start);
                        }
                         //如果是xml,则进入parsePI 的状态机
                         else if(ch==''?'')
                        {
                                mPage.ungetCharacter(mCursor);
                                ret=parsePI(start);
                        }
                         //如果是’/''或字母,则进入parseTag的状态机
                         else if(ch==''/''||ch==''%''||Character.isLetter(ch))
                        {
                                mPage.ungetCharacter(mCursor);
                                ret=parseTag(start);
                        }
                         //如果是"!"则进入注释标签
                         else if(ch==''!'')
                        {
                                ch=mPage.getCharacter(mCursor);
                                 if(Page.EOF==ch)
                                        ret=makeString(start,mCursor.getPosition());
                                 else
                                {
                                         if(ch==''>'')
                                                ret=makeRemark(start,mCursor.getPosition());
                                         else
                                        {
                                                 //进入Remark/Tag状态机,需要回退一个字符
                                                mPage.ungetCharacter(mCursor);
                                                 if(ch==''-'')
                                                        ret=parseRemark(start,quotesmart);
                                                 else
                                                {
                                                        mPage.ungetCharacter(mCursor);
                                                        ret">=parseTag(start);
                                                }
                                        }
                                }
                        }
                         //进入parserString状态机
                         else
                                ret=parserString(start,quotesmart);
                         break;
                         default:
                                mPage.ungetCharacter(mCursor);
                                ret=parseString(start,quotesmart);
                         break;
                }
                 return ret;
        } 加入main运行

public static void main(String[] args)
        {
             ConnectionManager manager= null;
             Lexer lexer;
             Node node;
             try
             {
                     manager=Page.getConnectionManager();
                     lexer= new Lexer(manager.openConnection( "http://astro.sina.com.cn/sagittarius.html"));
                     while(null!=(node=lexer.nextNode(false)))
                     {
                             System.out.println(node.toString());
                     }
             }
             catch(ParserException e)
             {
                     e.printStackTrace();
             }
        }



解析结果

Tag (0[0,0],6[0,6]): HTML
Tag (6[0,6],12[0,12]): HEAD
Txt (12[0,12],13[1,0]):
Tag (13[1,0],20[1,7]): TITLE
End (20[1,7],28[1,15]): /TITLE
Txt (28[1,15],29[2,0]):
Tag (29[2,0],97[2,68]): meta http-equiv="Content-Type" content="text/html; ch...
Txt (97[2,68],98[3,0]):
Tag (98[3,0],200[3,102]): META http-equiv="refresh" content ="0;url='http://a...
Txt (200[3,102],201[4,0]):
End (201[4,0],208[4,7]): /HEAD
Txt (208[4,7],209[5,0]):
Tag (209[5,0],215[5,6]): body
Rem (215[5,6],239[5,30]):    SUDA_CODE_START
Txt (239[5,30],240[6,0]):
Tag (240[6,0],348[6,108]): div id=sudsclickstreamdiv style='position:absolute...
End (348[6,108],354[6,114]): /div
Txt (354[6,114],355[7,0]):
Tag (355[7,0],386[7,31]): script type="text/javascript"
Txt (386[7,31],389[8,2]):    //
Rem (389[8,2],5901[31,5]):    var _S_JV_="webbug_meta_ref_mod_noiframe_async_:...
Txt (5901[31,5],5902[32,0]):
End (5902[32,0],5911[32,9]): /script
Txt (5911[32,9],5912[33,0]):
Tag (5912[33,0],5943[33,31]): script type="text/javascript"
Txt (5943[33,31],5946[34,2]):    //
Rem (5946[34,2],5968[36,5]):    _S_pSt(""); //
Txt (5968[36,5],5969[37,0]):
End (5969[37,0],5978[37,9]): /script
Txt (5978[37,9],5979[38,0]):
Rem (5979[38,0],6001[38,22]):    SUDA_CODE_END
Txt (6001[38,22],6005[41,0]):    
Rem (6005[41,0],6029[41,24]):    Start    Wrating    
Txt (6029[41,24],6031[43,0]):
Tag (6031[43,0],6061[43,30]): script language="javascript"
Txt (6061[43,30],9627[44,3565]):    var wrUrl="//sina.wrating.com/";var wrDoma...
Tag (9627[44,3565],9667[44,3605]): img src="'+V+'" width="1" height="1" /
Txt (9667[44,3605],10863[44,4801]): ')}function vjGetAcc(){var B=document.loc...
Tag (10863[44,4801],11245[45,0]): A){var D=E+F;if(document.cookie.substring(E...
End (11245[45,0],11254[45,9]): /script
Txt (11254[45,9],11256[47,0]):
Rem (11256[47,0],11275[47,19]):    End Wrating
Txt (11275[47,19],11277[49,0]):
Rem (11277[49,0],11327[49,50]):    START Nielsen//NetRatings SiteCensus V5.2
Txt (11327[49,50],11328[50,0]):
Rem (11328[50,0],11371[50,43]):    COPYRIGHT 2006 Nielsen//NetRatings
Txt (11371[50,43],11372[51,0]):
Tag (11372[51,0],11403[51,31]): script type="text/javascript"
Txt (11403[51,31],11562[59,0]):    var _rsCI="cn-sina2006";     var _rsCG=...
End (11562[59,0],11571[59,9]): /script
Txt (11571[59,9],11572[60,0]):
Tag (11572[60,0],11645[60,73]): script type="text/javascript" src="//secure-c...
End (11645[60,73],11654[60,82]): /script
Txt (11654[60,82],11655[61,0]):
Tag (11655[61,0],11665[61,10]): noscript
Txt (11665[61,10],11667[62,1]):
Tag (11667[62,1],11749[62,83]): img src="//secure-cn.imrworldwide.com/cgi-bin...
Txt (11749[62,83],11750[63,0]):
End (11750[63,0],11761[63,11]): /noscript
Txt (11761[63,11],11762[64,0]):
Rem (11762[64,0],11810[64,48]):    END Nielsen//NetRatings SiteCensus V5.2
Txt (11810[64,48],11811[65,0]):
End (11811[65,0],11818[65,7]): /body
End (11818[65,7],11825[65,14]): /html

用这个代码肯定还不行,下一篇给出它的全代码

OK