引用:
http://www.cnblogs.com/gaoweipeng/archive/2009/09/02/1558279.html
介绍了两种不错的HTML解析器的方法。
第一种方法:HTML正则表达式的方法。
参见:http://www.cnblogs.com/gaoweipeng/archive/2009/09/02/1558279.html
或者直接解析标签,以前我曾经使用DELPHI就是直接解析,C#也有。
参考网页:
http://www.codeproject.com/Articles/57176/Parsing-HTML-Tags-in-C
第二种方法:采用.NET自带的WebBrowser结合HtmlDocument进行解析。
这种方法的问题在于依赖WebBrowser控件。
源代码:
using System;
using System.Collections;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.Web;
using System.IO;
namespace IntrospectionHtml
{
public partial class Form1 : Form
{
/// <summary>
/// Tree node that remembers the HTML element it stands for (if any) and the
/// category of the tree it belongs to.
/// </summary>
public class TreeNodeHtmlElm : TreeNode
{
    /// <summary>Category of a node.</summary>
    public enum TypeNode
    {
        Html,
        Form,
        Link,
        Image,
        Css
    };

    // Longest excerpt of an element's OuterText that is used as a node caption.
    private const int MaxLabelLength = 100;

    /// <summary>
    /// Element represented by this node; null for nodes created from a plain caption.
    /// </summary>
    public HtmlElement mHtmlElement;

    private TypeNode type;

    /// <summary>Gets or sets the category of this node.</summary>
    public TypeNode Type
    {
        get { return type; }
        set { type = value; }
    }

    /// <summary>Creates a node with a fixed caption and no associated element.</summary>
    /// <param name="elm">Caption shown in the tree.</param>
    /// <param name="t">Category of the node.</param>
    public TreeNodeHtmlElm(string elm, TypeNode t)
        : base()
    {
        type = t;
        Text = elm;
        mHtmlElement = null;
    }

    /// <summary>Creates a node for an element with an explicit caption.</summary>
    /// <param name="elm">Element represented by the node.</param>
    /// <param name="txt">Caption shown in the tree.</param>
    /// <param name="t">Category of the node.</param>
    public TreeNodeHtmlElm(HtmlElement elm, string txt, TypeNode t)
        : base()
    {
        type = t;
        Text = txt;
        mHtmlElement = elm;
    }

    /// <summary>
    /// Creates a node for an element and derives the caption from the element itself.
    /// </summary>
    /// <param name="elm">Element represented by the node.</param>
    /// <param name="t">Category of the node.</param>
    public TreeNodeHtmlElm(HtmlElement elm, TypeNode t)
        : base()
    {
        type = t;
        mHtmlElement = elm;
        Text = BuildCaption(elm);
    }

    /// <summary>
    /// Returns the first <see cref="MaxLabelLength"/> characters of the element's
    /// OuterText, its OuterHtml when it has no text, or an empty string when the
    /// element is null or its properties cannot be read.
    /// </summary>
    private static string BuildCaption(HtmlElement elm)
    {
        if (elm == null)
        {
            return "";
        }

        try
        {
            // Read the property once instead of once per comparison.
            string outerText = elm.OuterText;
            if (string.IsNullOrEmpty(outerText))
            {
                return elm.OuterHtml;
            }
            if (outerText.Length > MaxLabelLength)
            {
                return outerText.Substring(0, MaxLabelLength);
            }
            return outerText;
        }
        catch (Exception)
        {
            // Best effort: a node without a caption is preferable to aborting the tree build.
            return "";
        }
    }
};
// "CSS" category node of the tree; assigned by webBrowser1_DocumentCompleted
// and read again by SaveTreeNodeHtml.
private TreeNodeHtmlElm nodeCss = null;

/// <summary>
/// Builds the form, points the second browser at a blank page and sets the
/// initial splitter position.
/// </summary>
public Form1()
{
    const string blankPage = "about:blank";
    const int initialSplitterDistance = 50;

    InitializeComponent();
    webBrowser2.Navigate(blankPage);
    splitContainer1.SplitterDistance = initialSplitterDistance;
}
/// <summary>
/// Recursively mirrors the children of an HTML element as child nodes of a tree node.
/// </summary>
/// <param name="hElmFather">Element whose children are added.</param>
/// <param name="t">Tree node that receives one child node per child element.</param>
/// <param name="type">Category assigned to every node that is created.</param>
private void FillTree(HtmlElement hElmFather, TreeNodeHtmlElm t, TreeNodeHtmlElm.TypeNode type)
{
    foreach (HtmlElement child in hElmFather.Children)
    {
        TreeNodeHtmlElm childNode = new TreeNodeHtmlElm(child, type);
        t.Nodes.Add(childNode);

        // Leaf elements need no recursive call.
        if (child.Children.Count == 0)
        {
            continue;
        }

        FillTree(child, childNode, type);
    }
}
/// <summary>
/// Adds the content of every form of the document below the given node.
/// </summary>
/// <param name="doc">Document whose Forms collection is walked.</param>
/// <param name="t">Tree node that receives the nodes created by FillTree.</param>
private void FillTreeForm(HtmlDocument doc, TreeNodeHtmlElm t)
{
    foreach (object current in doc.Forms)
    {
        HtmlElement form = (HtmlElement)current;
        FillTree(form, t, TreeNodeHtmlElm.TypeNode.Form);
    }
}
/// <summary>
/// Adds one node per distinct "href" value found in the document's Links collection.
/// </summary>
/// <param name="doc">Document whose links are listed.</param>
/// <param name="t">Tree node that receives the link nodes.</param>
private void FillTreeLink(HtmlDocument doc, TreeNodeHtmlElm t)
{
    AddDistinctAttributeNodes(doc.Links, "href", t, TreeNodeHtmlElm.TypeNode.Link);
}

/// <summary>
/// Adds one node per distinct "src" value found in the document's Images collection.
/// </summary>
/// <param name="doc">Document whose images are listed.</param>
/// <param name="t">Tree node that receives the image nodes.</param>
private void FillTreeImage(HtmlDocument doc, TreeNodeHtmlElm t)
{
    AddDistinctAttributeNodes(doc.Images, "src", t, TreeNodeHtmlElm.TypeNode.Image);
}

/// <summary>
/// Shared implementation of FillTreeLink and FillTreeImage: creates a node,
/// captioned with the attribute value, for the first element carrying each
/// distinct value of the attribute; later duplicates are skipped.
/// </summary>
/// <param name="elements">Elements to inspect, in document order.</param>
/// <param name="attributeName">Attribute whose value identifies duplicates.</param>
/// <param name="t">Tree node that receives the created nodes.</param>
/// <param name="type">Category assigned to the created nodes.</param>
private static void AddDistinctAttributeNodes(HtmlElementCollection elements, string attributeName, TreeNodeHtmlElm t, TreeNodeHtmlElm.TypeNode type)
{
    // A dictionary is used as a set so each duplicate check is a hash lookup
    // instead of a linear scan of everything seen so far.
    Dictionary<string, bool> seen = new Dictionary<string, bool>();

    foreach (HtmlElement element in elements)
    {
        string value = element.GetAttribute(attributeName);

        // Dictionary keys cannot be null; treat a missing value as empty.
        string key = value ?? string.Empty;
        if (seen.ContainsKey(key))
        {
            continue;
        }

        seen.Add(key, true);
        t.Nodes.Add(new TreeNodeHtmlElm(element, value, type));
    }
}
/// <summary>
/// Adds one node, captioned with its "href" value, for every link element of
/// the document whose "rel" attribute is "stylesheet".
/// </summary>
/// <param name="doc">Document whose elements are scanned.</param>
/// <param name="t">Tree node that receives the stylesheet nodes.</param>
private void FillTreeCss(HtmlDocument doc, TreeNodeHtmlElm t)
{
    foreach (HtmlElement element in doc.All)
    {
        // Ordinal, case-insensitive comparison: ToLower() depends on the current
        // culture (under Turkish rules "LINK" does not lower-case to "link") and
        // would throw on a null value.
        if (!string.Equals(element.TagName, "link", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }
        if (!string.Equals(element.GetAttribute("rel"), "stylesheet", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        TreeNodeHtmlElm node = new TreeNodeHtmlElm(element, element.GetAttribute("href"), TreeNodeHtmlElm.TypeNode.Css);
        t.Nodes.Add(node);
    }
}
/// <summary>
/// Shows the inner HTML of the element behind the selected tree node in the
/// second browser. Does nothing when no node is selected, when the node has no
/// element (the category nodes), or when the preview document has no body.
/// </summary>
private void ShowInHtmlPreview()
{
    TreeNodeHtmlElm tn = treeView1.SelectedNode as TreeNodeHtmlElm;
    if (tn == null || tn.mHtmlElement == null)
    {
        return;
    }

    try
    {
        HtmlDocument previewDocument = webBrowser2.Document;
        if (previewDocument == null || previewDocument.Body == null)
        {
            return;
        }

        previewDocument.Body.InnerHtml = "<html><body>" + tn.mHtmlElement.InnerHtml + "</body></html>";
    }
    catch (Exception exp)
    {
        // The preview stays best effort, but the failure is no longer discarded silently.
        System.Diagnostics.Debug.WriteLine("ShowInHtmlPreview failed: " + exp);
    }
}
/// <summary>
/// Writes the inner HTML of the element behind the selected tree node to a
/// file, preceded by one stylesheet link per child of the CSS category node.
/// Returns without creating the file when the selected node has no element.
/// </summary>
/// <param name="filename">Path of the file to create or overwrite.</param>
private void SaveTreeNodeHtml(string filename)
{
    // TODO (from the original author): move this into its own object.
    TreeNodeHtmlElm tn = treeView1.SelectedNode as TreeNodeHtmlElm;
    if (tn == null || tn.mHtmlElement == null)
    {
        return;
    }

    // Read the markup before the file is opened so a failure here cannot
    // leave an empty or truncated file behind.
    string innerHtml = tn.mHtmlElement.InnerHtml;

    // Write to the path that was passed in, and let "using" close the writer
    // even when a write fails.
    using (StreamWriter sw = new StreamWriter(filename))
    {
        sw.WriteLine("<html>");
        sw.WriteLine("<body>");

        // TODO (from the original author): add the CSS to the HTML and also copy the stylesheet file.
        if (nodeCss != null)
        {
            foreach (TreeNode cssNode in nodeCss.Nodes)
            {
                // Escape the characters that would break a double-quoted attribute value.
                string href = cssNode.Text.Replace("&", "&amp;").Replace("\"", "&quot;");
                sw.WriteLine("<link rel=\"stylesheet\" href=\"" + href + "\" type=\"text/css\" media=\"screen\" />");
            }
        }

        sw.WriteLine(innerHtml);
        sw.WriteLine("</body></html>");
    }
}
/// <summary>
/// Navigates both browser controls to the address typed in the text box and
/// collapses the second panel of splitContainer3.
/// </summary>
private void button1_Click(object sender, EventArgs e)
{
    string address = textBox1.Text;

    webBrowser1.Navigate(address);
    webBrowser2.Navigate(address);

    splitContainer3.Panel2Collapsed = true;
}
/// <summary>
/// Reacts to a selection change in the tree: expands the second panel of
/// splitContainer3, shows the node's element in the property grid and refreshes
/// the HTML preview.
/// </summary>
private void treeView1_AfterSelect(object sender, TreeViewEventArgs e)
{
    splitContainer3.Panel2Collapsed = false;

    TreeNodeHtmlElm selected = (TreeNodeHtmlElm)treeView1.SelectedNode;
    HtmlElement element = selected.mHtmlElement;
    propertyGrid1.SelectedObject = element;

    ShowInHtmlPreview();
}
/// <summary>
/// Toggles the second panel of splitContainer3 between collapsed and expanded.
/// </summary>
private void buttonColapsePropertyGrid_Click(object sender, EventArgs e)
{
    bool isCollapsed = splitContainer3.Panel2Collapsed;
    splitContainer3.Panel2Collapsed = !isCollapsed;
}
/// <summary>
/// Asks for a file name and saves the HTML of the selected node. Nothing
/// happens when no node is selected or when the selected node carries no HTML
/// element.
/// </summary>
private void saveToolStripMenuItem_Click(object sender, EventArgs e)
{
    // Category nodes are created without an element; saving one would fail
    // only after the chosen file had already been created or truncated.
    TreeNodeHtmlElm selected = treeView1.SelectedNode as TreeNodeHtmlElm;
    if (selected == null || selected.mHtmlElement == null)
    {
        return;
    }

    if (saveFileDialog1.ShowDialog() == DialogResult.OK)
    {
        SaveTreeNodeHtml(saveFileDialog1.FileName);
    }
}
/// <summary>
/// Menu handler: refreshes the HTML preview for the currently selected node.
/// </summary>
private void showInHtmlToolStripMenuItem_Click(object sender, EventArgs e)
{
    this.ShowInHtmlPreview();
}
/// <summary>
/// Rebuilds the tree from the document loaded in webBrowser1: one top-level
/// node each for the body, the forms, the links, the images and the
/// stylesheets. The category nodes are always created; they stay empty when
/// the browser has no document (or, for the body node, no body).
/// </summary>
private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    // NOTE(review): this event may be raised more than once per page (for
    // example for framed pages), in which case the tree is rebuilt each time - confirm.
    HtmlDocument doc = webBrowser1.Document;

    TreeNodeHtmlElm nodeBody = new TreeNodeHtmlElm("body", TreeNodeHtmlElm.TypeNode.Html);
    TreeNodeHtmlElm nodeForm = new TreeNodeHtmlElm("forms", TreeNodeHtmlElm.TypeNode.Form);
    TreeNodeHtmlElm nodeLink = new TreeNodeHtmlElm("Links", TreeNodeHtmlElm.TypeNode.Link);
    TreeNodeHtmlElm nodeImg = new TreeNodeHtmlElm("Images", TreeNodeHtmlElm.TypeNode.Image);
    nodeCss = new TreeNodeHtmlElm("CSS", TreeNodeHtmlElm.TypeNode.Css);

    // Suspend repainting while the nodes are inserted.
    treeView1.BeginUpdate();
    try
    {
        treeView1.Nodes.Clear();
        treeView1.Nodes.Add(nodeBody);
        treeView1.Nodes.Add(nodeForm);
        treeView1.Nodes.Add(nodeLink);
        treeView1.Nodes.Add(nodeImg);
        treeView1.Nodes.Add(nodeCss);

        if (doc != null)
        {
            if (doc.Body != null)
            {
                FillTree(doc.Body, nodeBody, TreeNodeHtmlElm.TypeNode.Html);
            }
            FillTreeForm(doc, nodeForm);
            FillTreeLink(doc, nodeLink);
            FillTreeImage(doc, nodeImg);
            FillTreeCss(doc, nodeCss);
        }
    }
    finally
    {
        treeView1.EndUpdate();
    }
}
/// <summary>
/// Treats validation of the address box like a click on button1, forwarding
/// the same sender and event arguments.
/// </summary>
private void textBox1_Validated(object sender, EventArgs e)
{
    this.button1_Click(sender, e);
}
/// <summary>
/// Enables the context menu only while a tree node is selected. The switch on
/// the node category currently performs no action for any category.
/// </summary>
private void contextMenuStrip1_Opening(object sender, CancelEventArgs e)
{
    bool hasSelection = treeView1.SelectedNode != null;
    contextMenuStrip1.Enabled = hasSelection;
    if (!hasSelection)
    {
        return;
    }

    TreeNodeHtmlElm selected = (TreeNodeHtmlElm)treeView1.SelectedNode;
    switch (selected.Type)
    {
        // No category-specific handling is implemented.
        case TreeNodeHtmlElm.TypeNode.Html:
        case TreeNodeHtmlElm.TypeNode.Form:
        case TreeNodeHtmlElm.TypeNode.Css:
        case TreeNodeHtmlElm.TypeNode.Image:
        case TreeNodeHtmlElm.TypeNode.Link:
            break;
    }
}
}
}
第三种方法:第三方开源组件。
Winista.Htmlparser.Net,源代码参考引用的地址。
http://www.cnblogs.com/gaoweipeng/archive/2009/09/02/1558279.html
HtmlAgilityPack。
下载地址可以通过百度或谷歌搜索找到。
我们比较一下。
Winista.HTMLParser比较庞大,还要引用CSharp.zip包,功能强大,提供了HTTP协议支持。
HtmlAgilityPack比较轻量,而且解析速度比Winista.HTMLParser快5倍左右,这可能就是轻量的好处。
我试用同样一个页面134的HTML页面。
Winista.HTMLParser:大约39000毫秒。
HTMLAgilityPack:大约7800毫秒.
但是,HTMLAgilityPack对于中文支持差一些,BUG还不少,希望逐步改进。
如果只是做网络爬虫、根据网址解析网页,HtmlAgilityPack比Winista.HTMLParser好很多,而且足够用了;至于压缩功能,就没有必要了吧。
强烈推荐HtmlAgilityPack.