HttpWebResponse 抓包百度一搜一大堆,但是完整的配置和注释对新手来说还是很难得,这里我来说一下
HttpWebResponse response = null; // declaration
// Do not send "Expect: 100-continue"; this works around error responses from some sites.
ServicePointManager.Expect100Continue = false;
// Maximum number of concurrent connections; worth configuring for multi-task / multi-threaded crawling.
System.Net.ServicePointManager.DefaultConnectionLimit = 500;
下面来说一下多网卡配置,普通机子或者是只有一张网卡的可以不用看这个,也不用配置。以下代码中UrlIP是本机网卡的ip(info.url是要抓取的目标地址),就是指定要走哪张网卡来抓取数据
// Choose which local network interface (by its IP, UrlIP) the request goes out through.
if (info.IsSxNet == false && UrlIP != null)
{
    Uri uri = new Uri(info.url);
    ServicePoint sp = ServicePointManager.FindServicePoint(uri);
    // Bind outgoing connections of this service point to the local endpoint UrlIP:port().
    // The delegate is stored on the ServicePoint returned for info.url, not on a single request.
    sp.BindIPEndPointDelegate = (servicePoint, remoteEp, retryCount) =>
        new IPEndPoint(IPAddress.Parse(UrlIP), port());
}
初始化对象
// Direct cast instead of "as": a non-HTTP URL fails here with an InvalidCastException
// instead of a NullReferenceException on the first later use of request.
var request = (HttpWebRequest)WebRequest.Create(info.url);
下面来配置抓取https的时候需要的设置
// For https URLs (case-insensitive prefix check), install CheckValidationResult as the
// server-certificate validation callback.
// NOTE(review): ServicePointManager.ServerCertificateValidationCallback is a static property,
// so this assignment is not limited to the current request.
// NOTE(review): CheckValidationResult is not visible here; presumably it accepts every
// certificate — confirm, because that would disable TLS certificate validation.
if (info.url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
{
ServicePointManager.ServerCertificateValidationCallback =
CheckValidationResult;
}
设置cookie,这个对要求登录的网站是必须要设置的
// Send the caller-supplied cookie string, if any, as the Cookie request header.
var cookieHeader = info.Set_Cookie;
if (cookieHeader != null)
{
    request.Headers["Cookie"] = cookieHeader;
}
设置代理ip,代理ip端口号,在抓取某个网站时可能会遇到网站防抓取限制IP的情况,这时可以使用代理IP来解决,一般某宝或百度上就很多这种代理ip卖的
// Route the request through the proxy at info.Ip when info.IsPort is set.
if (info.IsPort)
{
    var proxyAddress = new Uri("http://" + info.Ip);
    request.Proxy = new WebProxy(proxyAddress);
}
设置http请求的头部,这个是除了.net提供的外一些特殊的或者商家自定义http头部参数的配置
// Apply every extra header from info.HeadKey to the request.
// SetHeaderValue is used rather than request.Headers.Set for each entry.
if (info.HeadKey != null)
{
    foreach (DictionaryEntry entry in info.HeadKey)
    {
        string headerName = entry.Key.ToString();
        string headerValue = entry.Value.ToString();
        SetHeaderValue(request.Headers, headerName, headerValue);
    }
}
/// <summary>
/// Sets a header on <paramref name="header"/> by writing into the collection's non-public
/// "InnerCollection" property via reflection. This works around the
/// "Keep-Alive and Close may not be set using this property" error.
/// </summary>
/// <param name="header">The header collection to modify; must not be null.</param>
/// <param name="name">The header name.</param>
/// <param name="value">The header value; replaces any existing value for <paramref name="name"/>.</param>
/// <exception cref="System.ArgumentNullException"><paramref name="header"/> is null.</exception>
public void SetHeaderValue(WebHeaderCollection header, string name, string value)
{
    if (header == null)
    {
        throw new System.ArgumentNullException("header");
    }

    // Look up the non-public instance property that exposes the backing NameValueCollection.
    var property = typeof(WebHeaderCollection).GetProperty(
        "InnerCollection",
        System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.NonPublic);
    var collection = property != null
        ? property.GetValue(header, null) as NameValueCollection
        : null;

    if (collection != null)
    {
        collection[name] = value;
        return;
    }

    // The private property is unavailable: use the public setter instead of
    // silently dropping the header.
    header.Set(name, value);
}
设置请求的超时时间
// Use info.OutTime as the request timeout; when it is 0, fall back to 80000.
request.Timeout = info.OutTime != 0 ? info.OutTime : 80000;
设置是否允许自动跳转
request.AllowAutoRedirect = info.AllowAutoRedirect;
设置请求方式,是GET还是POST
request.Method = info.Method;
设置来源地址(Referer)
request.Referer = info.Referer;
设置可接受的响应内容类型(Accept)
request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
设置请求的客户端类型
request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36";
设置请求数据的内容类型(Content-Type)
request.ContentType = "application/x-www-form-urlencoded";
判断是GET还是POST,如果是POST就看如下代码
// POST only: write the request body.
// The string payload (info.postdata, encoded with `encoding`) takes precedence; the raw
// payload (info.Postdata) is used only when the string payload is null or empty.
if (info.Method == "POST")
{
    byte[] body = null;
    if (!string.IsNullOrEmpty(info.postdata))
    {
        body = encoding.GetBytes(info.postdata);
    }
    else if (info.Postdata != null)
    {
        body = info.Postdata;
    }

    if (body != null)
    {
        request.ContentLength = body.Length;
        // "using" closes the request stream even when Write throws.
        using (var outstream = request.GetRequestStream())
        {
            outstream.Write(body, 0, body.Length);
        }
    }
}
开始发送请求
// Send the request and obtain the response.
response = request.GetResponse() as HttpWebResponse;
获取返回的cookie
// When info.cookieoff is 0 and the server sent a non-empty Set-Cookie header,
// hand that header back to the caller through info.OutCookies.
if (info.cookieoff == 0 && !string.IsNullOrEmpty(response.Headers["set-cookie"]))
{
    info.OutCookies = response.Headers["set-cookie"];
}
判断返回的信息是否需要跳转
// Open the response body stream and wrap it in a reader that decodes with `encoding`.
var instream = response.GetResponseStream();// start reading the data of the remote URL
str = new StreamReader(instream, encoding);
// If the response carries a Location header, expose it through info.Location.
// NOTE(review): IsNullOrEmpty is unqualified here; presumably a helper defined elsewhere
// or meant to be string.IsNullOrEmpty — confirm.
if (!IsNullOrEmpty(response.Headers["Location"]))
{
info.Location = response.Headers["Location"];
}
判断远程Url是否使用了压缩,如果是就解压
// If the Content-Encoding mentions gzip, decompress the body and return it as text.
// The match is ordinal and case-insensitive, so it does not depend on the current culture
// (ToLower() maps "GZIP" to "gzıp" under tr-TR and would miss it).
if (response.ContentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
{
    using (var gzip = new System.IO.Compression.GZipStream(
        response.GetResponseStream(), System.IO.Compression.CompressionMode.Decompress))
    using (var reader = new StreamReader(gzip, encoding))
    {
        return reader.ReadToEnd();
    }
}
读取返回的html信息
// Read the whole (uncompressed) body and return it.
// Disposing the reader also closes the underlying response stream, so the connection is
// released instead of staying open until garbage collection.
using (str)
{
    return str.ReadToEnd();
}
作者:403648571@qq.com,QQ群:695080688