在 C# 中使用 HtmlParser.NET 的示例。

 
using System;
using System.IO;
using Winista.Text.HtmlParser;
using Winista.Text.HtmlParser.Lex;
using Winista.Text.HtmlParser.Util;
using Winista.Text.HtmlParser.Tags;

/// <summary>
/// Event handler (for button1's Click event, judging by its name): parses the
/// HTML text currently in <c>textBox1</c> and displays its tag structure in
/// <c>treeView1</c> beneath a single root node.
/// </summary>
/// <param name="sender">Event source; not used.</param>
/// <param name="e">Event data; not used.</param>
private void button1_Click(object sender, EventArgs e)
{
    // The markup could instead be loaded from a local file through a stream,
    // or from a web page through its URI. Stream-based sketch:
    //   byte[] htmlBytes = Encoding.UTF8.GetBytes(this.textBox1.Text);
    //   MemoryStream memStream = new MemoryStream(htmlBytes);
    //   InputStreamSource input = new InputStreamSource(memStream, "utf-8");
    //   Page page = new Page(input);
    //   Lexer lex = new Lexer(page);

    // Here the HTML comes straight from the text box; nothing to do when empty.
    string markup = this.textBox1.Text;
    if (markup.Length == 0)
    {
        return;
    }

    // Parse first, so the tree is only cleared once parsing has returned.
    // NOTE(review): null is presumably "no node filter" - confirm against the
    // HtmlParser.NET Parser.Parse documentation.
    Parser htmlParser = new Parser(new Lexer(markup));
    NodeList topLevelNodes = htmlParser.Parse(null);

    // Rebuild the tree from scratch under one root node.
    TreeNodeCollection treeNodes = this.treeView1.Nodes;
    treeNodes.Clear();
    TreeNode rootNode = treeNodes.Add(" root ");

    // Each top-level node is handed over individually, so sibling traversal
    // is not requested from the recursive helper at this level.
    int index = 0;
    while (index < topLevelNodes.Count)
    {
        this.RecursionHtmlNode(rootNode, topLevelNodes[index], false);
        index++;
    }
}

// Attributes that are echoed into a tag's tree label, in display order.
// The names are upper case because the lookups below index tag.Attributes
// with upper-case keys (presumably how HtmlParser.NET stores them - confirm).
private static readonly string[] LabelAttributeNames = { "ID", "CLASS", "STYLE", "HREF" };

/// <summary>
/// Mirrors an HTML node, and everything beneath it, into the tree view.
/// </summary>
/// <remarks>
/// Only opening tags produce a tree node. Its label is the tag name followed
/// by one <c>{ name="value" }</c> group for each of the id, class, style and
/// href attributes that is present. Other nodes (end tags, non-tag nodes) add
/// nothing themselves; any children they have are attached to
/// <paramref name="treeNode"/> directly.
/// </remarks>
/// <param name="treeNode">Tree node that receives the generated child nodes.</param>
/// <param name="htmlNode">HTML node to convert; nothing happens when it is null.</param>
/// <param name="siblingRequired">
/// When true, every following sibling of <paramref name="htmlNode"/> is also
/// converted and attached to <paramref name="treeNode"/>.
/// </param>
private void RecursionHtmlNode(TreeNode treeNode, INode htmlNode, bool siblingRequired)
{
    if (htmlNode == null || treeNode == null)
    {
        return;
    }

    // Parent for this node's children; stays the incoming parent unless an
    // opening tag creates a tree node of its own below.
    TreeNode current = treeNode;

    // Current node.
    ITag tag = htmlNode as ITag;
    if (tag != null && !tag.IsEndTag())
    {
        string nodeString = tag.TagName;
        if (tag.Attributes != null && tag.Attributes.Count > 0)
        {
            foreach (string attributeName in LabelAttributeNames)
            {
                // Single lookup per attribute; the key must be the bare name
                // (no surrounding spaces) or it can never match.
                object attributeValue = tag.Attributes[attributeName];
                if (attributeValue != null)
                {
                    nodeString += " { " + attributeName.ToLowerInvariant()
                        + "=\"" + attributeValue.ToString() + "\" }";
                }
            }
        }

        current = new TreeNode(nodeString);
        treeNode.Nodes.Add(current);
    }

    // Child nodes: start at the first child and let that call walk the rest
    // of the child list through its sibling loop.
    if (htmlNode.Children != null && htmlNode.Children.Count > 0)
    {
        this.RecursionHtmlNode(current, htmlNode.FirstChild, true);
    }

    // Sibling nodes: iterate rather than recurse, and pass false so each
    // sibling does not walk the remainder of the list again.
    if (siblingRequired)
    {
        for (INode sibling = htmlNode.NextSibling; sibling != null; sibling = sibling.NextSibling)
        {
            this.RecursionHtmlNode(treeNode, sibling, false);
        }
    }
}

    Screenshot of the example:
   
    The parser is very fault-tolerant, as the screenshot below shows. It could recover from malformed markup more intelligently, but I think it is good enough for practical use:

  • 1
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值