一个C#分析html的包，从老外的一个Spider程序里提取出来的

最新推荐文章于 2023-11-17 08:55:39 发布

hflkl1314

最新推荐文章于 2023-11-17 08:55:39 发布

阅读量1.4k

点赞数

分类专栏： c# 文章标签： html c# attributes string character buffer

本文链接：https://blog.csdn.net/hflkl1314/article/details/6911411

版权

c# 专栏收录该内容

30 篇文章 0 订阅

订阅专栏

一个C#分析html的包，以前思考如何分析html，总是毫无头绪，情况太多，太复杂。后来在一个老外写的Spider程序里发现了这个。非常好用，屡试不爽，现在才知道自己写程序还在一个很低的层次，尤其是面向对象思想的理解。利用继承的思想来分解问题，利用多个类来实现多种情况的变化，这个才是OO思想，而自己很多时候只是为了OO而去OO,其实水平还是停留在面向过程里。好好分析下这些代码，受益颇多。

/// <summary>
/// A single attribute as it appears inside an HTML or XML tag:
/// a name, a value, and the quote character that delimited the value.
///
/// This spider is copyright 2003 by Jeff Heaton. However, it is
/// released under a Limited GNU Public License (LGPL). You may
/// use it freely in your own programs. For the latest version visit
/// http://www.jeffheaton.com.
/// </summary>
public class Attribute : ICloneable
{
    /// <summary>Backing field for <c>Name</c>.</summary>
    private string m_name;

    /// <summary>Backing field for <c>Value</c>.</summary>
    private string m_value;

    /// <summary>
    /// Backing field for <c>Delim</c>; the NUL character means "no delimiter".
    /// </summary>
    private char m_delim;

    /// <summary>
    /// Create a blank attribute: empty name, empty value, no delimiter.
    /// </summary>
    public Attribute() : this("", "", '\0')
    {
    }

    /// <summary>
    /// Create an attribute whose value has no delimiter.
    /// </summary>
    /// <param name="name">The attribute name.</param>
    /// <param name="value">The attribute value.</param>
    public Attribute(String name, String value) : this(name, value, '\0')
    {
    }

    /// <summary>
    /// Create an attribute with every property given explicitly.
    /// </summary>
    /// <param name="name">The attribute name.</param>
    /// <param name="value">The attribute value.</param>
    /// <param name="delim">The quote character around the value (i.e. " or ').</param>
    public Attribute(string name, string value, char delim)
    {
        m_delim = delim;
        m_value = value;
        m_name = name;
    }

    /// <summary>
    /// Gets or sets the attribute name.
    /// </summary>
    public string Name
    {
        get { return m_name; }
        set { m_name = value; }
    }

    /// <summary>
    /// Gets or sets the attribute value.
    /// </summary>
    public string Value
    {
        get { return m_value; }
        set { m_value = value; }
    }

    /// <summary>
    /// Gets or sets the character that delimits the value.
    /// </summary>
    public char Delim
    {
        get { return m_delim; }
        set { m_delim = value; }
    }

    #region ICloneable Members
    /// <summary>
    /// Create a new Attribute carrying the same name, value and delimiter.
    /// </summary>
    /// <returns>The newly created copy.</returns>
    public virtual object Clone()
    {
        Attribute copy = new Attribute();
        copy.m_name = m_name;
        copy.m_value = m_value;
        copy.m_delim = m_delim;
        return copy;
    }
    #endregion
}

以上是一个基础类，专门用来分析html属性的。

/// <summary>
/// An ordered list of Attribute objects. The list derives from Attribute,
/// so it also carries a name, value and delimiter of its own.
/// </summary>
public class AttributeList : Attribute
{
    /// <summary>
    /// Backing store holding the attributes in the order they were added.
    /// </summary>
    protected ArrayList m_list;

    /// <summary>
    /// Create a new, empty attribute list with an empty name and value.
    /// </summary>
    public AttributeList() : base("", "")
    {
        m_list = new ArrayList();
    }

    /// <summary>
    /// Make a copy of this list. Each contained attribute is cloned, and the
    /// list's own name, value and delimiter are carried over as well.
    /// </summary>
    /// <returns>A new AttributeList holding clones of every entry.</returns>
    public override Object Clone()
    {
        AttributeList rtn = new AttributeList();

        // The list's own Name/Value/Delim are part of its state; without
        // copying them the clone would silently lose them.
        rtn.Name = Name;
        rtn.Value = Value;
        rtn.Delim = Delim;

        foreach (Attribute item in m_list)
        {
            // m_list is reachable through List, so a null entry is possible;
            // keep it in place rather than failing the whole copy.
            rtn.m_list.Add(item == null ? null : item.Clone());
        }

        return rtn;
    }

    /// <summary>
    /// Append the specified attribute to the end of the list.
    /// </summary>
    /// <param name="a">The attribute to add.</param>
    public void Add(Attribute a)
    {
        m_list.Add(a);
    }

    /// <summary>
    /// Remove every attribute, returning the list to an empty state.
    /// </summary>
    public void Clear()
    {
        m_list.Clear();
    }

    /// <summary>
    /// Report whether the list holds no attributes.
    /// </summary>
    /// <returns>True if the list is empty, false otherwise.</returns>
    public bool IsEmpty()
    {
        return m_list.Count == 0;
    }

    /// <summary>
    /// Set the value of the attribute with the given name, creating the
    /// attribute (without a delimiter) if none exists yet. The name lookup
    /// is case-insensitive. A null name is ignored; a null value is stored
    /// as the empty string.
    /// </summary>
    /// <param name="name">The name of the attribute to edit or create.</param>
    /// <param name="value">The value to store in the attribute.</param>
    public void Set(string name, string value)
    {
        if (name == null)
        {
            return;
        }

        if (value == null)
        {
            value = "";
        }

        Attribute existing = this[name];
        if (existing == null)
        {
            Add(new Attribute(name, value));
        }
        else
        {
            existing.Value = value;
        }
    }

    /// <summary>
    /// The number of attributes in this list.
    /// </summary>
    public int Count
    {
        get { return m_list.Count; }
    }

    /// <summary>
    /// The underlying list of attributes. This is the live backing store,
    /// not a copy.
    /// </summary>
    public ArrayList List
    {
        get { return m_list; }
    }

    /// <summary>
    /// Access an attribute by position. Returns null when the index is
    /// outside the list (negative, or at/after Count).
    /// </summary>
    public Attribute this[int index]
    {
        get
        {
            if (index < 0 || index >= m_list.Count)
            {
                return null;
            }

            return (Attribute)m_list[index];
        }
    }

    /// <summary>
    /// Access an attribute by name, ignoring case. Returns the first match,
    /// or null when the name is null or no attribute has that name.
    /// </summary>
    public Attribute this[string index]
    {
        get
        {
            if (index == null)
            {
                return null;
            }

            for (int i = 0; i < m_list.Count; i++)
            {
                Attribute candidate = (Attribute)m_list[i];

                // Ordinal comparison: attribute names are identifiers, so the
                // match must not depend on the current culture's casing rules.
                if (candidate != null &&
                    string.Equals(candidate.Name, index, StringComparison.OrdinalIgnoreCase))
                {
                    return candidate;
                }
            }

            return null;
        }
    }
}

以上是一个属性列表。

/// <summary>
/// Low-level cursor over a source string, with helpers that read one
/// attribute name and one attribute value at the current position. Parsed
/// attributes are collected in the inherited attribute list.
/// </summary>
public class Parse : AttributeList
{
    /// <summary>
    /// The source text that is being parsed.
    /// </summary>
    private string m_source;

    /// <summary>
    /// The current position inside the text that is being parsed.
    /// </summary>
    private int m_idx;

    /// <summary>
    /// The most recently parsed attribute delimiter.
    /// </summary>
    private char m_parseDelim;

    /// <summary>
    /// The most recently parsed attribute name.
    /// </summary>
    private string m_parseName;

    /// <summary>
    /// The most recently parsed attribute value.
    /// </summary>
    private string m_parseValue;

    /// <summary>
    /// The most recently parsed tag.
    /// </summary>
    public string m_tag;

    /// <summary>
    /// Determine whether the specified character is whitespace
    /// (tab, line feed, carriage return or space).
    /// </summary>
    /// <param name="ch">A character to check.</param>
    /// <returns>True if the character is whitespace.</returns>
    public static bool IsWhiteSpace(char ch)
    {
        return "\t\n\r ".IndexOf(ch) != -1;
    }

    /// <summary>
    /// Advance the index past any whitespace at the current position.
    /// </summary>
    public void EatWhiteSpace()
    {
        while (!Eof() && IsWhiteSpace(GetCurrentChar()))
        {
            m_idx++;
        }
    }

    /// <summary>
    /// Determine whether the end of the source text has been reached.
    /// A source that has not been set counts as already exhausted.
    /// </summary>
    /// <returns>True if no characters remain.</returns>
    public bool Eof()
    {
        return m_source == null || m_idx >= m_source.Length;
    }

    /// <summary>
    /// Skip leading whitespace, append the characters up to the next
    /// whitespace, '=' or '&gt;' to the current attribute name, then skip
    /// trailing whitespace.
    /// </summary>
    public void ParseAttributeName()
    {
        EatWhiteSpace();

        int start = m_idx;
        while (!Eof())
        {
            char ch = GetCurrentChar();
            if (IsWhiteSpace(ch) || ch == '=' || ch == '>')
            {
                break;
            }
            m_idx++;
        }

        // Take the whole run in one slice instead of growing the string one
        // character at a time.
        if (m_idx > start)
        {
            m_parseName += m_source.Substring(start, m_idx - start);
        }

        EatWhiteSpace();
    }

    /// <summary>
    /// If the current character is '=', read the attribute value that
    /// follows and append it to the current attribute value. A value opening
    /// with ' or " runs to the matching quote (or to the end of the source if
    /// the quote is never closed); otherwise it runs to the next whitespace
    /// or '&gt;'. Does nothing if a delimiter has already been recorded.
    /// </summary>
    public void ParseAttributeValue()
    {
        if (m_parseDelim != 0)
        {
            return;
        }

        if (GetCurrentChar() != '=')
        {
            return;
        }

        m_idx++;
        EatWhiteSpace();

        char opener = GetCurrentChar();
        if (opener == '\'' || opener == '\"')
        {
            m_parseDelim = opener;
            m_idx++;

            int start = m_idx;
            // The Eof check matters: at the end of the source GetCurrentChar
            // yields NUL, which never equals the delimiter, so an unterminated
            // quote would otherwise spin forever.
            while (!Eof() && GetCurrentChar() != m_parseDelim)
            {
                m_idx++;
            }

            if (m_idx > start)
            {
                m_parseValue += m_source.Substring(start, m_idx - start);
            }

            // Step over the closing quote when there is one.
            if (!Eof())
            {
                m_idx++;
            }
        }
        else
        {
            int start = m_idx;
            while (!Eof() &&
                   !IsWhiteSpace(GetCurrentChar()) &&
                   GetCurrentChar() != '>')
            {
                m_idx++;
            }

            if (m_idx > start)
            {
                m_parseValue += m_source.Substring(start, m_idx - start);
            }
        }

        EatWhiteSpace();
    }

    /// <summary>
    /// Add an attribute built from the most recently parsed name, value and
    /// delimiter to the collection.
    /// </summary>
    public void AddAttribute()
    {
        Add(new Attribute(m_parseName, m_parseValue, m_parseDelim));
    }

    /// <summary>
    /// Get the character at the current position without advancing.
    /// </summary>
    /// <returns>The current character, or NUL at the end of the source.</returns>
    public char GetCurrentChar()
    {
        return GetCurrentChar(0);
    }

    /// <summary>
    /// Get the character a given distance from the current position
    /// without advancing.
    /// </summary>
    /// <param name="peek">How many characters to peek ahead.</param>
    /// <returns>The character at that position, or NUL when the position
    /// lies outside the source text.</returns>
    public char GetCurrentChar(int peek)
    {
        int pos = m_idx + peek;
        if (m_source != null && pos >= 0 && pos < m_source.Length)
        {
            return m_source[pos];
        }

        return (char)0;
    }

    /// <summary>
    /// Return the character at the current position and advance the index
    /// by one. No bounds check is performed, so callers should test Eof()
    /// first.
    /// </summary>
    /// <returns>The character that was at the current position.</returns>
    public char AdvanceCurrentChar()
    {
        return m_source[m_idx++];
    }

    /// <summary>
    /// Move the index forward by one.
    /// </summary>
    public void Advance()
    {
        m_idx++;
    }

    /// <summary>
    /// The last attribute name that was encountered.
    /// </summary>
    public string ParseName
    {
        get { return m_parseName; }
        set { m_parseName = value; }
    }

    /// <summary>
    /// The last attribute value that was encountered.
    /// </summary>
    public string ParseValue
    {
        get { return m_parseValue; }
        set { m_parseValue = value; }
    }

    /// <summary>
    /// The last attribute delimiter that was encountered.
    /// </summary>
    public char ParseDelim
    {
        get { return m_parseDelim; }
        set { m_parseDelim = value; }
    }

    /// <summary>
    /// The text that is to be parsed. Assigning it does not move the
    /// current position.
    /// </summary>
    public string Source
    {
        get { return m_source; }
        set { m_source = value; }
    }
}

上面的是分析标签的，并且将标签的属性Parse进属性列表里。

/// <summary>
/// Splits HTML source into plain characters and tags. Each call to Parse()
/// yields either one text character or, for a tag, NUL with the tag name in
/// m_tag and the tag's attributes in the inherited attribute list.
/// </summary>
public class ParseHTML : Parse
{
    /// <summary>
    /// Build a standalone copy of the most recently parsed tag.
    /// </summary>
    /// <returns>A new AttributeList named after the tag and holding a clone
    /// of each parsed attribute.</returns>
    public AttributeList GetTag()
    {
        AttributeList tag = new AttributeList();
        tag.Name = m_tag;

        foreach (Attribute x in List)
        {
            tag.Add((Attribute)x.Clone());
        }

        return tag;
    }

    /// <summary>
    /// Rebuild the text of the most recently parsed tag from m_tag and the
    /// current attributes.
    /// </summary>
    /// <returns>The tag text, starting with '&lt;' and ending with '&gt;'.</returns>
    public String BuildTag()
    {
        System.Text.StringBuilder buffer = new System.Text.StringBuilder("<");
        buffer.Append(m_tag);

        // The int indexer returns null past the last attribute.
        for (int i = 0; this[i] != null; i++)
        {
            Attribute attr = this[i];
            bool quoted = attr.Delim != 0;

            buffer.Append(' ');

            if (attr.Value == null)
            {
                // No value at all: emit just the name, wrapped in the
                // delimiter when one was recorded.
                if (quoted)
                {
                    buffer.Append(attr.Delim);
                }
                buffer.Append(attr.Name);
                if (quoted)
                {
                    buffer.Append(attr.Delim);
                }
            }
            else
            {
                buffer.Append(attr.Name);
                buffer.Append('=');
                if (quoted)
                {
                    buffer.Append(attr.Delim);
                }
                buffer.Append(attr.Value);
                if (quoted)
                {
                    buffer.Append(attr.Delim);
                }
            }
        }

        buffer.Append('>');
        return buffer.ToString();
    }

    /// <summary>
    /// Parse one tag, starting just after its opening '&lt;'. Resets m_tag and
    /// the attribute list, then fills them from the source. A comment is
    /// stored whole in m_tag (carriage returns removed) with no attributes.
    /// </summary>
    protected void ParseTag()
    {
        m_tag = "";
        Clear();

        // Is it a comment?
        if (GetCurrentChar() == '!' &&
            GetCurrentChar(1) == '-' &&
            GetCurrentChar(2) == '-')
        {
            while (!Eof())
            {
                if (GetCurrentChar() == '-' &&
                    GetCurrentChar(1) == '-' &&
                    GetCurrentChar(2) == '>')
                {
                    break;
                }

                if (GetCurrentChar() != '\r')
                {
                    m_tag += GetCurrentChar();
                }
                Advance();
            }

            // Re-append the closing dashes, then step over "-->".
            m_tag += "--";
            Advance();
            Advance();
            Advance();
            ParseDelim = (char)0;
            return;
        }

        // Find the tag name
        while (!Eof())
        {
            if (IsWhiteSpace(GetCurrentChar()) || GetCurrentChar() == '>')
            {
                break;
            }
            m_tag += GetCurrentChar();
            Advance();
        }

        EatWhiteSpace();

        // Get the attributes. The Eof check is required: at the end of the
        // source GetCurrentChar yields NUL, never '>', so an unterminated tag
        // would otherwise add empty attributes forever.
        while (!Eof() && GetCurrentChar() != '>')
        {
            ParseName = "";
            ParseValue = "";
            ParseDelim = (char)0;

            ParseAttributeName();

            if (GetCurrentChar() == '>')
            {
                AddAttribute();
                break;
            }

            // Get the value (if any)
            ParseAttributeValue();
            AddAttribute();
        }

        // Step over the closing '>'.
        Advance();
    }

    /// <summary>
    /// Consume the next item from the source. Callers should check Eof()
    /// before calling.
    /// </summary>
    /// <returns>NUL when a tag was parsed (see m_tag and the attribute list);
    /// otherwise the text character that was consumed. A '&lt;' that does not
    /// start a tag is returned as a literal '&lt;'.</returns>
    public char Parse()
    {
        if (GetCurrentChar() != '<')
        {
            return AdvanceCurrentChar();
        }

        Advance();

        // Invariant casing: under some cultures (e.g. Turkish) ToUpper maps
        // 'i' outside A-Z, which would hide tags such as <img> or <input>.
        char ch = char.ToUpperInvariant(GetCurrentChar());
        if ((ch >= 'A' && ch <= 'Z') || ch == '!' || ch == '/')
        {
            ParseTag();
            return (char)0;
        }

        // Not a tag: hand back the '<' itself and leave the following
        // character for the next call. This also covers a '<' that is the
        // very last character of the source.
        return '<';
    }
}

最后一个类是用来从HTML里分离出Tag，进而适合分析属性时使用。

通过以上4个类，一个原本很难分析的问题就很轻松地解决了。其实这里就透露出一种分析问题、解决问题的思考过程。我想程序员应该养成的习惯就是将一个复杂的问题逐步分解，分解成很多个很细小的问题，小问题解决了，这个复杂的问题也就解决了。而这些问题之间的逻辑联系如何组织，如何做到耦合性最低，是很值得思考的一个问题。现在想想以前在大学里学的软件工程，似乎可以渐渐明白什么是健壮，什么是可复用了。

hflkl1314

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
一个C#分析html的包，从老外的一个Spider程序里提取出来的

一个C#分析html的包，以前思考如何分析html，总是毫无头绪，情况太多，太复杂。后来在一个老外写的Spider程序里发现了这个。非常好用，屡试不爽，现在才知道自己写程序还在一个很低的层次，尤其是面向对象思想的理解。利用继承的思想来分解问题，利用多个类来实现多种情况的变化，这个才是OO思想，而自己很多时候只是为了OO而去OO,其实水平还是停留在面向过程里。好好分析下这些代码，受益颇多。
复制链接

扫一扫