将Html源码解析成IHTMLDocument对象,然后使用DOMNode将html显示成一棵树

功能:

    将Html源码解析成IHTMLDocument2对象,然后将IHTMLDocument2转换成IHTMLDocument3,使用DOMNode将html显示成一棵树。此解析不执行任何脚本,不从网上下载任何资料,是一个纯文本的解析。

    (方法 Parse(string s) 是一个轻量级的解析(Parsing)实现。这段代码不会从网上下载任何资料,也不会执行任何脚本,只做纯粹的解析。
解析是通过MSHTML的Markup Services实现的。要正确使用这段代码,需要添加对MSHTML的引用。)

      要正确编译如下代码,还需要修改unsafe(启用不安全模式)编译器选项,将其开启。

方法:在“项目”->“<应用程序名称>属性”对话框中打开“配置属性”,选中“生成”项,修改“允许不安全代码块”的内容为true.

[C#]

using System;
using System.Drawing;
using System.Collections;
using System.ComponentModel;
using System.Windows.Forms;
using System.Data;
using mshtml;
using System.Runtime.InteropServices;
using System.IO;
 
namespace WindowsApplication1
{
  
 /// <summary>
 /// Managed declaration of the COM interface IPersistStreamInit, imported by its
 /// interface GUID and declared as an IUnknown-derived interface.
 /// In this file it is only used by Form1.Parse, which casts a newly created
 /// HTML document to it and calls <see cref="InitNew"/>.
 /// </summary>
 /// <remarks>
 /// For a ComImport interface the order of the members defines the vtable slot
 /// order, so the members below must not be reordered, inserted or removed.
 /// </remarks>
 [ComVisible(true), ComImport(), Guid("7FD52380-4E07-101B-AE2D-08002B2EC713") , InterfaceTypeAttribute(ComInterfaceType.InterfaceIsIUnknown)]
 public interface IPersistStreamInit 
 {
  // NOTE(review): declared as [In, Out] ref; the native method looks like a pure
  // out-parameter (CLSID*) - confirm before relying on the incoming value.
  void GetClassID([In, Out] ref Guid pClassID);
  // PreserveSig: the raw int result is returned to the caller instead of being
  // translated into an exception.
  [return: MarshalAs(UnmanagedType.I4)] [PreserveSig]
  int IsDirty();
  // NOTE(review): UCOMIStream is the .NET 1.x stream interop type and is marked
  // obsolete in later framework versions - confirm the target framework.
  void Load([In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm);
  void Save([In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm, 
   [In, MarshalAs(UnmanagedType.I4)] int fClearDirty);
  // NOTE(review): UnmanagedType.LPArray on a by-value scalar long looks wrong; the
  // native method presumably writes the size through a pointer, which would map
  // to "out long". Not called anywhere in this file - verify before first use.
  void GetSizeMax([Out, MarshalAs(UnmanagedType.LPArray)] long pcbSize);
  // Called by Form1.Parse right after the document object is created.
  void InitNew();
 } 
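 /// <summary>
 /// Sample form: a button that loads and parses an HTML file, and a TreeView
 /// that displays the parsed DOM as a tree.
 /// </summary>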
 public class Form1 : System.Windows.Forms.Form
 {
  // Button whose Click event is wired to button1_Click in InitializeComponent.
  private System.Windows.Forms.Button button1;
  // Tree view that button1_Click populates with the parsed DOM nodes.
  private System.Windows.Forms.TreeView treeView1;
  /// <summary>
  /// Required designer variable. It is initialised to null and never assigned
  /// elsewhere in this file, so the null check in Dispose is what keeps the
  /// disposal path safe.
  /// </summary>
  private System.ComponentModel.Container components = null;

  public Form1()
  {
   //
   // Windows 窗体设计器支持所必需的
   //
   InitializeComponent();

   //
   // TODO: 在 InitializeComponent 调用后添加任何构造函数代码
   //
  }

  /// <summary>
  /// 清理所有正在使用的资源。
  /// </summary>
  protected override void Dispose( bool disposing )
  {
   if( disposing )
   {
    if (components != null)
    {
     components.Dispose();
    }
   }
   base.Dispose( disposing );
  }

  #region Windows 窗体设计器生成的代码
  /// <summary>
  /// Required method for Designer support - do not modify
  /// the contents of this method with the code editor.
  /// Creates button1 and treeView1, sets their layout properties, hooks
  /// button1.Click to button1_Click and adds both controls to the form.
  /// </summary>
  private void InitializeComponent()
  {
   this.button1 = new System.Windows.Forms.Button();
   this.treeView1 = new System.Windows.Forms.TreeView();
   // Layout is suspended while the controls are configured and resumed at the end.
   this.SuspendLayout();
   //
   // button1
   //
   this.button1.Location = new System.Drawing.Point(24, 16);
   this.button1.Name = "button1";
   this.button1.Size = new System.Drawing.Size(88, 24);
   this.button1.TabIndex = 0;
   this.button1.Text = "button1";
   this.button1.Click += new System.EventHandler(this.button1_Click);
   //
   // treeView1
   //
   this.treeView1.ImageIndex = -1;
   this.treeView1.Location = new System.Drawing.Point(280, 96);
   this.treeView1.Name = "treeView1";
   this.treeView1.SelectedImageIndex = -1;
   this.treeView1.Size = new System.Drawing.Size(288, 224);
   this.treeView1.TabIndex = 1;
   //
   // Form1
   //
   this.AutoScaleBaseSize = new System.Drawing.Size(6, 14);
   this.ClientSize = new System.Drawing.Size(664, 333);
   this.Controls.Add(this.treeView1);
   this.Controls.Add(this.button1);
   this.Name = "Form1";
   this.Text = "Form1";
   this.ResumeLayout(false);

  }
  #endregion

  /// <summary>
  /// 应用程序的主入口点。
  /// </summary>
  [STAThread]
  static void Main()
  {
   Application.Run(new Form1());
  }
  unsafe IHTMLDocument2  Parse(string s)
  {
   IHTMLDocument2 pDocument=new HTMLDocumentClass();  
   if(pDocument!=null)
   {
    IPersistStreamInit pPersist=pDocument as IPersistStreamInit ;
    pPersist.InitNew();
    pPersist=null;
    IMarkupServices ms=pDocument as IMarkupServices ;
    if(ms!=null)
    {
     IMarkupContainer pMC=null;
     IMarkupPointer pStart,pEnd;
     ms.CreateMarkupPointer(out pStart);
     ms.CreateMarkupPointer(out pEnd);
     System.Text.StringBuilder sb=new System.Text.StringBuilder(s); 
     IntPtr pSource=Marshal.StringToHGlobalUni(s);
     ms.ParseString(ref *(ushort*)pSource.ToPointer(),0,out pMC,pStart,pEnd);
     if(pMC!=null)
     {
      Marshal.Release(pSource);
      return pMC as IHTMLDocument2;
     }
     Marshal.Release(pSource);
    }
   }
   return null;
  }

  private void button1_Click(object sender, System.EventArgs e)
  {
   

   string html="";
   string filename="D://NetC#Program//html//163.htm";
   if (!File.Exists(filename))
   {
    Console.WriteLine("文件不存在");
    return;
   }
   
   StreamReader sr1 = new StreamReader(
    (System.IO.Stream)File.OpenRead(filename),System.Text.Encoding.Default);
   html="";
   while (sr1.Peek()>-1)
   {
    html=html+sr1.ReadToEnd();
   }
   sr1.Close();

   IHTMLDocument2 doc2 = Parse(html);

   Console.WriteLine(doc2.styleSheets.length);
   IHTMLDocument3 HTMLDocument=(IHTMLDocument3)doc2;
   IHTMLDOMNode rootDomNode=(IHTMLDOMNode)HTMLDocument.documentElement;
   TreeNode root=treeView1.Nodes.Add("HTML");
   InsertDOMNodes(rootDomNode,root);
   

  }
  private void InsertDOMNodes(IHTMLDOMNode parentnode,TreeNode tree_node)
  {
   
   if(parentnode.hasChildNodes())//是否有子结点
   {
    IHTMLDOMChildrenCollection allchild = (IHTMLDOMChildrenCollection)parentnode.childNodes;
    int length = allchild.length;
    for(int i=0;i<length;i++)//对每个子结点进行处理,首先取出每个子节点的属性,然后进行递归
    {
     IHTMLDOMNode child_node = (IHTMLDOMNode)allchild.item(i);
     string m_snodeName  =child_node.nodeName;
     object m_onodevalue =child_node.nodeValue;
     string m_snodetype  =child_node.nodeType.ToString();
     string m_snodevalue ="";
     if ( m_onodevalue!=null)
      m_snodevalue =m_onodevalue.ToString().Trim();
     TreeNode tempnode=null;
     
     if (child_node.nodeName.Equals("#text"))
     {
      if ((m_snodevalue!=null)&& (!m_snodevalue.Equals("")))
      {
       tempnode = tree_node.Nodes.Add(m_snodevalue);
       
      }
      
     }
     else
     {
      tempnode = tree_node.Nodes.Add(child_node.nodeName);
     
      InsertDOMNodes(child_node,tempnode);
     }
    }
   }
  
  }
 }
}

阅读更多
想对作者说点什么? 我来说一句

没有更多推荐了,返回首页

关闭
关闭
关闭