爬虫,又称蜘蛛,是从其他网站抓取资源的一种方法。在 C# .NET 中使用爬虫的方法如下:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body
/// decoded as GB2312 text.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <returns>
/// The full response body as a string, or an empty string if any step
/// (creating the request, connecting, reading, decoding) throws.
/// </returns>
protected string GetPageHtml(string url)
{
    try
    {
        WebRequest request = WebRequest.Create(url);

        // Dispose the response and the reader deterministically; otherwise the
        // underlying connection stays open until the garbage collector runs.
        using (WebResponse response = request.GetResponse())
        using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("gb2312")))
        {
            return reader.ReadToEnd();
        }
    }
    catch
    {
        // Deliberate best-effort: callers receive an empty string on any failure.
        return "";
    }
}
按上述方法,就可以在程序中获取某个 URL 的页面源文件。
但是有些网站屏蔽了爬虫,这时就需要模拟浏览器发送请求来获取页面,具体代码如下:
/// <summary>
/// Downloads the page at <paramref name="url"/> while sending browser-like
/// <c>Accept</c> and <c>User-Agent</c> headers, and returns the body decoded
/// with <see cref="Encoding.Default"/>.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <returns>
/// The full response body as a string, or an empty string if any step
/// (creating the request, the cast to an HTTP request, connecting, reading)
/// throws.
/// </returns>
protected string GetPageHtml(string url)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

        // Present the request as coming from a desktop browser (IE 6 header values).
        request.Accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*";
        request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)";

        // Dispose response, stream and reader deterministically so the
        // connection is released as soon as the body has been read.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.Default))
        {
            return reader.ReadToEnd();
        }
    }
    catch
    {
        // Deliberate best-effort: callers receive an empty string on any failure.
        return "";
    }
}