// Downloads the page at strUrl and returns its HTML with every CR, LF and TAB removed.
// Values read from the enclosing scope: strUrl, referer, cookies, cookieDomain, encoding.
// On success crawlerState is set to CrawlerState.Success. On any failure the error is
// traced, crawlerState is left untouched and an empty string is returned.
if (String.IsNullOrEmpty(strUrl))
{
    // The message text means "the crawl address is empty".
    return "抓取地址为空!";
}

try
{
    HttpWebRequest req = (HttpWebRequest)HttpWebRequest.Create(strUrl);

    // Present the request as an ordinary desktop browser.
    req.UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1";
    req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    req.Headers.Add("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
    req.ContentType = "text/html";
    req.Timeout = 20000; // milliseconds

    // Always send a Referer so the target site does not see a referer-less request;
    // when the caller supplies none, the request's own host name is used.
    req.Referer = String.IsNullOrEmpty(referer) ? req.RequestUri.Host : referer;

    // Attach the caller's cookies to the request.
    CookieContainer cc = new CookieContainer();
    req.CookieContainer = cc;
    if (cookies != null)
    {
        foreach (Cookie cook in cookies)
        {
            Cookie c = new Cookie(cook.Name, cook.Value);

            // CookieContainer.Add(Cookie) throws ArgumentException for a cookie whose
            // Domain is empty, so always resolve one: the explicit override first, then
            // the source cookie's own domain, then the host being requested.
            if (!String.IsNullOrEmpty(cookieDomain))
            {
                c.Domain = cookieDomain;
            }
            else if (!String.IsNullOrEmpty(cook.Domain))
            {
                c.Domain = cook.Domain;
            }
            else
            {
                c.Domain = req.RequestUri.Host;
            }

            cc.Add(c);
        }
    }

    // The using blocks guarantee the response and its streams are released on every
    // path, including when an exception is thrown part-way through.
    using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
    {
        // Choose the text encoding. A reported charset of "ISO-8859-1" is treated as
        // unreliable (the original author's workaround for charset mis-detection), so
        // the caller-supplied encoding is used instead. The same fallback applies when
        // the charset is missing or is not a name Encoding.GetEncoding recognises.
        Encoding enc = encoding;
        string declaredCharset = resp.CharacterSet;
        if (!String.IsNullOrEmpty(declaredCharset) && "ISO-8859-1" != declaredCharset)
        {
            try
            {
                enc = Encoding.GetEncoding(declaredCharset);
            }
            catch (ArgumentException)
            {
                // Unknown charset name: keep the caller-supplied encoding.
            }
        }

        // Ordinal, case-insensitive search avoids culture-specific lower-casing surprises.
        string contentEncoding = resp.ContentEncoding;
        bool isGzip = contentEncoding != null
            && contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0;

        // Buffer the (decompressed) body as raw bytes so that it can be decoded a second
        // time with a different encoding without losing information.
        byte[] rawBytes;
        using (Stream responseStream = resp.GetResponseStream())
        using (Stream body = isGzip
            ? (Stream)new GZipStream(responseStream, CompressionMode.Decompress)
            : responseStream)
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int count;
            while ((count = body.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, count);
            }
            rawBytes = buffer.ToArray();
        }

        string sHTML;
        using (StreamReader firstPass = new StreamReader(new MemoryStream(rawBytes), enc))
        {
            sHTML = firstPass.ReadToEnd();
        }

        // As in the original code, charset sniffing is applied to uncompressed responses only.
        if (!isGzip)
        {
            // Look for a charset declaration inside the document itself.
            Match charSetMatch = Regex.Match(sHTML, "charset=(?<code>[a-zA-Z0-9\\-]+)", RegexOptions.IgnoreCase);
            string sChartSet = charSetMatch.Groups["code"].Value.Trim();

            // Heuristic: if the decoded text contains no run of 2-4 CJK ideographs, the
            // first decode is assumed to have used the wrong encoding, and the body is
            // decoded again with the charset the page declares.
            Regex rx = new Regex("([\u4e00-\u9fa5]{2,4})");
            if (!rx.IsMatch(sHTML) && !string.IsNullOrEmpty(sChartSet))
            {
                try
                {
                    Encoding pageEncoding = Encoding.GetEncoding(sChartSet);

                    // Decode from the raw bytes: round-tripping the already-decoded
                    // string through enc.GetBytes would corrupt any byte sequence the
                    // first decode had to replace.
                    using (StreamReader secondPass = new StreamReader(new MemoryStream(rawBytes), pageEncoding))
                    {
                        sHTML = secondPass.ReadToEnd();
                    }
                }
                catch (ArgumentException)
                {
                    // The page declares a charset this runtime does not know; keep the first decode.
                }
            }
        }

        // Strip line breaks and tabs.
        sHTML = sHTML.Replace("\n", "").Replace("\r", "").Replace("\t", "");

        // Mark the crawl task as successful.
        crawlerState = IWOMWebCrawlerDbLayer.Common.CrawlerState.Success;
        return sHTML;
    }
}
catch (Exception ex)
{
    // Best-effort fetch: callers still receive "" on failure, but the cause is no longer lost.
    System.Diagnostics.Trace.TraceError("Failed to fetch '" + strUrl + "': " + ex);
}
return "";
网页内容若使用gzip压缩--获得页面源码
最新推荐文章于 2017-11-23 19:31:01 发布