HttpWebRequest采集

最新推荐文章于 2024-09-10 14:15:01 发布

soarheaven

最新推荐文章于 2024-09-10 14:15:01 发布

阅读量604

点赞数

分类专栏： Asp.Net 文章标签： http服务器 webbrowser url html c# 浏览器

Asp.Net 专栏收录该内容

66 篇文章 0 订阅

订阅专栏

其实web的采集就是要模仿出一个浏览器出来去访问http服务器，将获取到的html代码解析出来。

在C#中有多种方法来实现HTML的request：可以使用WebClient、HttpWebRequest，还可以使用WebBrowser；

这里只说使用httpwebrequest的实现。

这里在采集上还有一个技巧：有些网站的URL规则可能根本就不需要你先采集分页的页面，就能直接采集你想要的内容。如很多网站都是采用ID来显示最后的路径，例：http://www.aaaa.com/a/show.php?id=111111

这样的采集直接访问URL即可。总之一句话，只有先取到了想要采集的页面URL，才能取到想要的东西（相当的废话。。。）

不多说，先来代码

C#代码

/// <summary>
/// Base class for a simple HTTP scraper built on <see cref="HttpWebRequest"/>.
/// Subclasses supply site-specific login logic by overriding <see cref="CheckLogin"/>;
/// the cookies gathered there are attached to every page request.
/// </summary>
public abstract class WebControler
{
    #region ConstString
    // Browser identity sent with every request.
    protected string sUserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";
    protected string sContentType = "application/x-www-form-urlencoded";
    protected string sRequestEncoding = "utf-8";
    // Name of the encoding used to decode response bodies.
    protected string sResponseEncoding = "utf-8";
    // Cookie store shared by all requests issued through this instance.
    protected CookieContainer _cookieContainer = null;
    #endregion

    /// <summary>
    /// Override in a subclass to perform the target site's login/verification
    /// and populate <see cref="_cookieContainer"/> with the cookie that
    /// identifies the user.
    /// </summary>
    protected virtual void CheckLogin()
    {
        // Create the container only once: replacing it on every request would
        // throw away cookies received from earlier responses.
        if (_cookieContainer == null)
        {
            _cookieContainer = new CookieContainer();
        }
    }

    /// <summary>
    /// Issues a GET request to <paramref name="url"/> and returns the response body as a string.
    /// </summary>
    /// <param name="url">Absolute HTTP/HTTPS URL of the page to fetch.</param>
    /// <returns>
    /// The response body decoded with <see cref="sResponseEncoding"/>, or an empty
    /// string when the response status is not 200 OK.
    /// </returns>
    /// <exception cref="ApplicationException">
    /// <paramref name="url"/> does not resolve to an HTTP(S) request.
    /// </exception>
    public string GetHtmlFromUrl(string url)
    {
        // This method was declared static while using instance members, which
        // does not compile; it must be an instance method.
        CheckLogin();

        HttpWebRequest httpRequest = WebRequest.Create(url) as HttpWebRequest;
        if (httpRequest == null)
        {
            throw new ApplicationException(
                string.Format("Invalid url string: {0}", url)
            );
        }

        // Some sites only serve pages to clients that first visited the home or
        // verification page, and record that in a cookie. Hand the verified
        // cookie container to the scraping request.
        httpRequest.CookieContainer = _cookieContainer;
        httpRequest.UserAgent = sUserAgent;
        httpRequest.Accept = "*/*";
        httpRequest.Headers.Add("Accept-Language", "zh-cn");
        httpRequest.KeepAlive = true;
        httpRequest.Timeout = 10000;
        httpRequest.Method = "GET";

        // 'using' releases the connection even if reading the body throws.
        using (HttpWebResponse response = (HttpWebResponse)httpRequest.GetResponse())
        {
            if (response.StatusCode != HttpStatusCode.OK)
            {
                return string.Empty;
            }

            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding(sResponseEncoding)))
            {
                return reader.ReadToEnd();
            }
        }
    }
}