// Downloads strUrl with a browser-like GET request and returns the page text with
// all CR, LF and TAB characters removed.
//
// Inputs read from the enclosing scope: strUrl, referer, cookies, cookieDomain, encoding.
// Outputs written to the enclosing scope: req, resp, stream, read (all released in the
// finally block) and crawlerState (Success, Forbat on HTTP 503, Error otherwise).
// On failure the exception is logged and control falls through to the code after this
// statement; nothing is returned from here.
try
{
    req = (HttpWebRequest)HttpWebRequest.Create(strUrl);
    req.UserAgent = "Mozilla/5.0 (Windows NT 5.2; rv:6.0) Gecko/20100101 Firefox/6.0";
    req.Accept = "*/*";
    req.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
    req.ContentType = "text/xml";
    req.Timeout = 20000;

    // Send a Referer so the request is less likely to be rejected as an off-site hit;
    // default to the target host when the caller supplied none.
    if (String.IsNullOrEmpty(referer))
    {
        req.Referer = req.RequestUri.Host;
    }
    else
    {
        req.Referer = referer;
    }

    // Copy the caller's cookies (name and value only) into a fresh container.
    CookieContainer cc = new CookieContainer();
    req.CookieContainer = cc;
    if (cookies != null)
    {
        foreach (Cookie cook in cookies)
        {
            Cookie c = new Cookie(cook.Name, cook.Value);
            if (!String.IsNullOrEmpty(cookieDomain))
            {
                c.Domain = cookieDomain;
                cc.Add(c);
            }
            else
            {
                // CookieContainer.Add(Cookie) rejects a cookie whose Domain is empty,
                // so bind the cookie to the request URI instead.
                cc.Add(req.RequestUri, c);
            }
        }
    }

    resp = (HttpWebResponse)req.GetResponse();

    // Pick the decoding: start from the caller-supplied encoding and switch to the
    // charset reported by the response only when it is present, is not the
    // "ISO-8859-1" value (which the caller's encoding is meant to correct), and
    // names an encoding this runtime knows.
    Encoding enc = encoding;
    string declaredCharset = resp.CharacterSet;
    if (!String.IsNullOrEmpty(declaredCharset) && "ISO-8859-1" != declaredCharset)
    {
        try
        {
            enc = Encoding.GetEncoding(declaredCharset);
        }
        catch (ArgumentException)
        {
            // Unknown or malformed charset name: keep the caller-supplied encoding.
        }
    }

    //if (isAjax)
    //{
    // return GetAjaxUseWebBrowser(strUrl);
    //}

    string sHTML;
    stream = resp.GetResponseStream();

    // Ordinal, case-insensitive search so the check does not depend on the thread culture.
    string contentEncoding = resp.ContentEncoding;
    bool isGzip = contentEncoding != null
        && contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0;

    if (isGzip)
    {
        // Body is gzip-compressed: decompress while reading.
        stream = new GZipStream(stream, CompressionMode.Decompress);
        read = new StreamReader(stream, enc);
        sHTML = read.ReadToEnd();
    }
    else
    {
        read = new StreamReader(stream, enc);
        sHTML = read.ReadToEnd();

        // Look for a charset declaration inside the page text itself.
        Match charSetMatch = Regex.Match(sHTML, "charset=(?<code>[a-zA-Z0-9\\-]+)", RegexOptions.IgnoreCase);
        string sChartSet = charSetMatch.Groups["code"].Value.Trim();

        // If the decoded text contains no run of 2-4 CJK ideographs and the page
        // declares a charset, re-interpret the text using the declared charset.
        Regex rx = new Regex("([\u4e00-\u9fa5]{2,4})");
        if (!rx.IsMatch(sHTML) && !string.IsNullOrEmpty(sChartSet))
        {
            try
            {
                sHTML = Encoding.GetEncoding(sChartSet).GetString(enc.GetBytes(sHTML));
            }
            catch (ArgumentException)
            {
                // The page declared a charset this runtime does not know;
                // keep the text as first decoded rather than failing the fetch.
            }
        }
    }

    // Strip line breaks and tabs.
    sHTML = sHTML.Replace("\n", "").Replace("\r", "").Replace("\t", "");

    crawlerState = IWOMWebCrawlerDbLayer.Common.CrawlerState.Success;
    return sHTML;
}
catch (Exception ex)
{
    CommonFunction.logWirte(ex.ToString() + strUrl, LogGrade.Warning);

    // Detect HTTP 503 from the status code so the check works regardless of the
    // language of the runtime's exception messages.
    bool serviceUnavailable = false;
    WebException webEx = ex as WebException;
    if (webEx != null)
    {
        HttpWebResponse errorResp = webEx.Response as HttpWebResponse;
        if (errorResp != null)
        {
            serviceUnavailable = errorResp.StatusCode == HttpStatusCode.ServiceUnavailable;
            // The error response holds the connection until it is closed.
            errorResp.Close();
        }
    }

    // Fallback kept from the original: match the zh-CN localized 503 message.
    if (!serviceUnavailable
        && ex.Message.IndexOf("远程服务器返回错误: (503) 服务器不可用", StringComparison.Ordinal) > -1)
    {
        serviceUnavailable = true;
    }

    if (serviceUnavailable)
    {
        crawlerState = IWOMWebCrawlerDbLayer.Common.CrawlerState.Forbat;
    }
    else
    {
        crawlerState = IWOMWebCrawlerDbLayer.Common.CrawlerState.Error;
    }
}
finally
{
    // Release from the outermost wrapper inwards: reader, then stream, then response.
    if (read != null)
    {
        read.Close();
    }
    if (stream != null)
    {
        stream.Close();
    }
    if (resp != null)
    {
        resp.Close();
    }
}
网页内容为压缩形式的测试代码
最新推荐文章于 2023-08-14 15:30:02 发布