HTMLParser类的说明

最新推荐文章于 2024-04-09 21:53:32 发布

caoxu1987728

最新推荐文章于 2024-04-09 21:53:32 发布

阅读量781

点赞数

分类专栏： All Spiders 文章标签： buffer null

本文链接：https://blog.csdn.net/caoxu1987728/article/details/3009972

版权

All Spiders 专栏收录该内容

42 篇文章 0 订阅

订阅专栏

HTMLParser类：

这是一个处理网页源文件的类。这里涉及到了比较底层的东西。由于现在无法调试，而且无碍大局，暂时就不深究它了。

没事的时候可以看看：

 
 /**
 * The HTMLParser class is used to parse an HTML page.  It is
 * just a utility class, and does NOT store any values.
 */
package com.heaton.bot;
import com.heaton.bot.*;
public class HTMLParser extends Parse {
    
  public HTMLTag getTag()
  {
    int i;
    HTMLTag tag = new HTMLTag();
    tag.setName(this.tag);
    /*
     * 派生类中的对象可以直接调用其父类的方法
     */
    for ( i=0;i<vec.size();i++ )
      tag.add( (Attribute)get(i).clone() );
    return tag;
  }
  public String buildTag()
  {
    String buffer="<";
    buffer+=tag;
    int i=0;
    while ( get(i)!=null ) 
    {// has attributes 
      buffer+=" ";
      if ( get(i).getValue() == null ) 
      {
        if ( get(i).getDelim()!=0 )
          buffer+=get(i).getDelim();
        buffer+=get(i).getName();
        if ( get(i).getDelim()!=0 )
          buffer+=get(i).getDelim();
      } 
      else 
      {
        buffer+=get(i).getName();
        if ( get(i).getValue()!=null ) {
          buffer+="=";
          if ( get(i).getDelim()!=0 )
            buffer+=get(i).getDelim();
          buffer+=get(i).getValue();
          if ( get(i).getDelim()!=0 )
            buffer+=get(i).getDelim();
        }
      }
      i++;
    }
    buffer+=">";
    return buffer;
  }
  protected void parseTag()
  {
    idx++;
    tag="";
    clear();
    
    /*
     * Is it a comment?
     * 如果是注释。
     */
    if ( (source.charAt(idx)=='!') &
         (source.charAt(idx+1)=='-')&
         (source.charAt(idx+2)=='-') ) 
    {
      while ( !eof() ) 
      {
        if ( (source.charAt(idx)=='-') &
             (source.charAt(idx+1)=='-')&
             (source.charAt(idx+2)=='>') )
          break;
        if ( source.charAt(idx)!='/r' )
          tag+=source.charAt(idx);
        idx++;
      }
      tag+="--";
      idx+=3;
      parseDelim=0;
      return;
    }
    // Find the tag name 
    while ( !eof() ) 
    {
      if ( isWhiteSpace(source.charAt(idx)) || 
              (source.charAt(idx)=='>') )
        break;
      tag+=source.charAt(idx);
      idx++;
    }
    eatWhiteSpace();
    // get the attributes 
    while ( source.charAt(idx)!='>' ) {
      parseName = "";
      parseValue = "";
      parseDelim=0;
      parseAttributeName();
      if( eof() )
          break;
      
      if ( source.charAt(idx)=='>' ) {
        addAttribute();
        break;
      }
      // get the value(if any) 
      parseAttributeValue();
      addAttribute();
    }
    idx++;
  }
  public char get()
  {
    if ( source.charAt(idx)=='<' ) {
      char ch=Character.toUpperCase(source.charAt(idx+1));
      if ( (ch>='A') && (ch<='Z') || (ch=='!') || (ch=='/') ) {
        parseTag();
        return 0;
      } else return(source.charAt(idx++));
    } else return(source.charAt(idx++));
  }
}