前几天我在博客园转载了一篇"net HtmlParser初步使用研究"的文章,之后看过文章评论才知道原来现在已经有HtmlAgilityPack这个类库可以更方便地对HTML内容进行分析和提取。因此今天特别学习和实践了一下HtmlAgilityPack和XPath,并作下笔记。
HtmlAgilityPack是用C#写的开源Html Parser。不过可能在某些方面设计不尽完善,或者是对中文的支持并不是很好,例如,抓取中文页面时就出现乱码.
我的代码如下:
// Load the page through HtmlAgilityPack's own downloader; this is the call
// that yields garbled text for some Chinese sites.
HtmlWeb hw = new HtmlWeb();
HtmlDocument doc = hw.Load("http://www.xinlg.com");
//HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//a");

// Write the parsed markup itself; passing the HtmlDocument object would only
// emit the result of its ToString() call instead of the page content.
Response.Write(doc.DocumentNode.OuterHtml);
页面输出时出现了乱码,不过并不是所有页面都这样:cnblogs显示正常,而新浪、QQ、新华网等都是乱码,这应该跟页面的编码有关。
具体的开源代码我就没有去看了,我的解决方案就是自己写一个获取Html的方法.
代码如下:
/// <summary>
/// Downloads the HTML source of the specified URL and decodes it to a string.
/// </summary>
/// <param name="url">URL of the page to download.</param>
/// <param name="encoding">
/// Encoding used to decode the response body. Pass <c>null</c> to have the
/// encoding detected automatically by <c>DecodeData</c>.
/// </param>
/// <returns>
/// The decoded HTML; <see cref="string.Empty"/> when the request itself fails;
/// <c>null</c> when the status is not 200 OK or any other error occurs.
/// </returns>
public static string GetWebHtml(string url, Encoding encoding)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);
        HttpWebResponse response;
        try
        {
            response = (HttpWebResponse)request.GetResponse();
        }
        catch (Exception)
        {
            // Best effort by design: a failed request is reported as an empty string.
            return string.Empty;
        }

        // Dispose the response on every path so the underlying connection is released,
        // including when the status is not OK.
        using (response)
        {
            if (response.StatusCode != HttpStatusCode.OK)
            {
                return null;
            }

            using (Stream body = response.GetResponseStream())
            {
                if (encoding == null)
                {
                    // No encoding supplied: detect it from the response.
                    return DecodeData(body, response);
                }

                // Caller supplied the encoding: decode directly.
                using (StreamReader reader = new StreamReader(body, encoding))
                {
                    return reader.ReadToEnd();
                }
            }
        }
    }
    catch (Exception)
    {
        // Best effort by design: any other failure is reported as null.
        return null;
    }
}
/// <summary>
/// Reads the whole response body and decodes it with an automatically detected encoding.
/// The charset is taken from the Content-Type header if present, otherwise from the
/// first <c>charset=</c> declaration found in the body; gb2312 is used when neither
/// yields a usable encoding.
/// </summary>
/// <param name="responseStream">Response body stream; it is read to the end and closed.</param>
/// <param name="response">Response whose Content-Type header is inspected.</param>
/// <returns>The decoded body text.</returns>
private static string DecodeData(Stream responseStream, HttpWebResponse response)
{
    // First choice: the charset announced in the HTTP header.
    string charsetName = ExtractCharset(response.Headers["content-type"]);

    // Buffer the complete body so it can be scanned and then decoded.
    byte[] bodyBytes;
    using (MemoryStream buffer = new MemoryStream())
    {
        byte[] chunk = new byte[0x400];
        int read;
        while ((read = responseStream.Read(chunk, 0, chunk.Length)) > 0)
        {
            buffer.Write(chunk, 0, read);
        }
        bodyBytes = buffer.ToArray();
    }
    responseStream.Close();

    if (charsetName == null)
    {
        // Second choice: a charset declaration inside the markup. The body is decoded
        // as ASCII only to locate that declaration, not to produce the result.
        using (StreamReader probe = new StreamReader(new MemoryStream(bodyBytes), Encoding.ASCII))
        {
            charsetName = ExtractCharset(probe.ReadToEnd());
        }
    }

    using (StreamReader reader = new StreamReader(new MemoryStream(bodyBytes), ResolveEncoding(charsetName)))
    {
        return reader.ReadToEnd();
    }
}

/// <summary>
/// Returns the value following the first "charset=" (case-insensitive) in the text,
/// without surrounding quotes or trailing delimiters, or null when none is found.
/// </summary>
private static string ExtractCharset(string text)
{
    const string Marker = "charset=";
    if (string.IsNullOrEmpty(text))
    {
        return null;
    }

    int start = text.IndexOf(Marker, StringComparison.OrdinalIgnoreCase);
    if (start < 0)
    {
        return null;
    }
    start += Marker.Length;

    // Skip an opening quote or blanks, e.g. <meta charset="utf-8">.
    while (start < text.Length && (text[start] == '"' || text[start] == '\'' || char.IsWhiteSpace(text[start])))
    {
        start++;
    }

    // The name ends at the first character that cannot belong to a charset label
    // (closing quote, ';', '>', whitespace, ...).
    int end = start;
    while (end < text.Length && (char.IsLetterOrDigit(text[end]) || text[end] == '-' || text[end] == '_' || text[end] == '.' || text[end] == ':'))
    {
        end++;
    }

    return end > start ? text.Substring(start, end - start) : null;
}

/// <summary>
/// Maps a charset name to an Encoding, falling back to gb2312 when the name is
/// missing or not recognised.
/// </summary>
private static Encoding ResolveEncoding(string charsetName)
{
    if (charsetName == null)
    {
        return Encoding.GetEncoding("gb2312");
    }

    // Keep the original GBK -> GB2312 substitution, but match regardless of case.
    if (string.Equals(charsetName, "GBK", StringComparison.OrdinalIgnoreCase))
    {
        charsetName = "GB2312";
    }

    try
    {
        return Encoding.GetEncoding(charsetName);
    }
    catch (ArgumentException)
    {
        // Unknown charset name.
        return Encoding.GetEncoding("gb2312");
    }
    catch (NotSupportedException)
    {
        // Known name, but the encoding is unavailable on this platform.
        return Encoding.GetEncoding("gb2312");
    }
}
/// <summary>
/// Downloads the HTML source of the specified URL and decodes it to a string.
/// </summary>
/// <param name="url">URL of the page to download.</param>
/// <param name="encoding">
/// Encoding used to decode the response body. Pass <c>null</c> to have the
/// encoding detected automatically by <c>DecodeData</c>.
/// </param>
/// <returns>
/// The decoded HTML; <see cref="string.Empty"/> when the request itself fails;
/// <c>null</c> when the status is not 200 OK or any other error occurs.
/// </returns>
public static string GetWebHtml(string url, Encoding encoding)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);
        HttpWebResponse response;
        try
        {
            response = (HttpWebResponse)request.GetResponse();
        }
        catch (Exception)
        {
            // Best effort by design: a failed request is reported as an empty string.
            return string.Empty;
        }

        // Dispose the response on every path so the underlying connection is released,
        // including when the status is not OK.
        using (response)
        {
            if (response.StatusCode != HttpStatusCode.OK)
            {
                return null;
            }

            using (Stream body = response.GetResponseStream())
            {
                if (encoding == null)
                {
                    // No encoding supplied: detect it from the response.
                    return DecodeData(body, response);
                }

                // Caller supplied the encoding: decode directly.
                using (StreamReader reader = new StreamReader(body, encoding))
                {
                    return reader.ReadToEnd();
                }
            }
        }
    }
    catch (Exception)
    {
        // Best effort by design: any other failure is reported as null.
        return null;
    }
}
/// <summary>
/// Reads the whole response body and decodes it with an automatically detected encoding.
/// The charset is taken from the Content-Type header if present, otherwise from the
/// first <c>charset=</c> declaration found in the body; gb2312 is used when neither
/// yields a usable encoding.
/// </summary>
/// <param name="responseStream">Response body stream; it is read to the end and closed.</param>
/// <param name="response">Response whose Content-Type header is inspected.</param>
/// <returns>The decoded body text.</returns>
private static string DecodeData(Stream responseStream, HttpWebResponse response)
{
    // First choice: the charset announced in the HTTP header.
    string charsetName = ExtractCharset(response.Headers["content-type"]);

    // Buffer the complete body so it can be scanned and then decoded.
    byte[] bodyBytes;
    using (MemoryStream buffer = new MemoryStream())
    {
        byte[] chunk = new byte[0x400];
        int read;
        while ((read = responseStream.Read(chunk, 0, chunk.Length)) > 0)
        {
            buffer.Write(chunk, 0, read);
        }
        bodyBytes = buffer.ToArray();
    }
    responseStream.Close();

    if (charsetName == null)
    {
        // Second choice: a charset declaration inside the markup. The body is decoded
        // as ASCII only to locate that declaration, not to produce the result.
        using (StreamReader probe = new StreamReader(new MemoryStream(bodyBytes), Encoding.ASCII))
        {
            charsetName = ExtractCharset(probe.ReadToEnd());
        }
    }

    using (StreamReader reader = new StreamReader(new MemoryStream(bodyBytes), ResolveEncoding(charsetName)))
    {
        return reader.ReadToEnd();
    }
}

/// <summary>
/// Returns the value following the first "charset=" (case-insensitive) in the text,
/// without surrounding quotes or trailing delimiters, or null when none is found.
/// </summary>
private static string ExtractCharset(string text)
{
    const string Marker = "charset=";
    if (string.IsNullOrEmpty(text))
    {
        return null;
    }

    int start = text.IndexOf(Marker, StringComparison.OrdinalIgnoreCase);
    if (start < 0)
    {
        return null;
    }
    start += Marker.Length;

    // Skip an opening quote or blanks, e.g. <meta charset="utf-8">.
    while (start < text.Length && (text[start] == '"' || text[start] == '\'' || char.IsWhiteSpace(text[start])))
    {
        start++;
    }

    // The name ends at the first character that cannot belong to a charset label
    // (closing quote, ';', '>', whitespace, ...).
    int end = start;
    while (end < text.Length && (char.IsLetterOrDigit(text[end]) || text[end] == '-' || text[end] == '_' || text[end] == '.' || text[end] == ':'))
    {
        end++;
    }

    return end > start ? text.Substring(start, end - start) : null;
}

/// <summary>
/// Maps a charset name to an Encoding, falling back to gb2312 when the name is
/// missing or not recognised.
/// </summary>
private static Encoding ResolveEncoding(string charsetName)
{
    if (charsetName == null)
    {
        return Encoding.GetEncoding("gb2312");
    }

    // Keep the original GBK -> GB2312 substitution, but match regardless of case.
    if (string.Equals(charsetName, "GBK", StringComparison.OrdinalIgnoreCase))
    {
        charsetName = "GB2312";
    }

    try
    {
        return Encoding.GetEncoding(charsetName);
    }
    catch (ArgumentException)
    {
        // Unknown charset name.
        return Encoding.GetEncoding("gb2312");
    }
    catch (NotSupportedException)
    {
        // Known name, but the encoding is unavailable on this platform.
        return Encoding.GetEncoding("gb2312");
    }
}
我测试过了,大型网站都没有问题.
最后再调用时,代码如下:
// Pass null as the encoding so GetWebHtml detects it automatically.
string Html = XINLG.Labs.Utils.NetUtil.GetWebHtml("http://www.cnblogs.com/pick/", null);
HtmlDocument doc = new HtmlDocument();

// GetWebHtml returns null or an empty string when the download fails,
// so only parse when there is actual content.
if (!string.IsNullOrEmpty(Html))
{
    doc.LoadHtml(Html);
}
在上面的代码中,如果你不知道指定URL的页面编码是什么,直接给encoding参数传入null即可,方法会自动判断编码。
另外,网上还介绍了另一种方法:去HtmlAgilityPack的官网下载源代码,打开项目后自己修改编码相关的处理。具体代码我就不提供了,大家可以自己搜索一下。