网页内容为压缩形式的测试代码

            // Fetches strUrl with an HTTP GET, decodes the body (transparently
            // un-gzipping it when the server says it is gzip-compressed), strips
            // line breaks and tabs, records the outcome in crawlerState and returns
            // the resulting text. On any failure the exception is logged and
            // crawlerState is set to Forbat (HTTP 503) or Error; control then falls
            // through to whatever follows this statement.
            try
            {
                req = (HttpWebRequest)HttpWebRequest.Create(strUrl);

                req.UserAgent = "Mozilla/5.0 (Windows NT 5.2; rv:6.0) Gecko/20100101 Firefox/6.0";
                req.Accept = "*/*";
                req.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
                req.ContentType = "text/xml";
                req.Timeout = 20000;

                // Send a Referer so the request is less likely to be rejected as
                // coming from outside the site; default to the target host.
                req.Referer = String.IsNullOrEmpty(referer) ? req.RequestUri.Host : referer;

                // Attach the caller's cookies to the request.
                CookieContainer cc = new CookieContainer();
                req.CookieContainer = cc;

                if (cookies != null)
                {
                    foreach (Cookie cook in cookies)
                    {
                        Cookie c = new Cookie(cook.Name, cook.Value);

                        // CookieContainer.Add(Cookie) throws ArgumentException when
                        // Domain is empty, so fall back to the request host when no
                        // explicit cookie domain was supplied.
                        c.Domain = String.IsNullOrEmpty(cookieDomain) ? req.RequestUri.Host : cookieDomain;
                        cc.Add(c);
                    }
                }

                resp = (HttpWebResponse)req.GetResponse();

                // Pick the text encoding. "ISO-8859-1" is what .NET reports when the
                // server did not declare a charset, and CharacterSet may also be
                // null/empty or an unknown label; in all those cases use the
                // caller-supplied encoding instead.
                Encoding enc = encoding;
                string declaredCharset = resp.CharacterSet;
                if (!String.IsNullOrEmpty(declaredCharset) && "ISO-8859-1" != declaredCharset)
                {
                    try
                    {
                        enc = Encoding.GetEncoding(declaredCharset);
                    }
                    catch (ArgumentException)
                    {
                        // Unsupported charset label from the server.
                        enc = encoding;
                    }
                }

                string sHTML;

                // Obtain the response body stream.
                stream = resp.GetResponseStream();

                // Ordinal, case-insensitive check: ToLower() is culture-sensitive
                // and would miss "GZIP" under e.g. the Turkish culture.
                string contentEncoding = resp.ContentEncoding;
                bool isGzip = contentEncoding != null
                    && contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0;

                if (isGzip)
                {
                    // Wrap the body in a decompressing stream before decoding.
                    stream = new GZipStream(stream, CompressionMode.Decompress);
                    read = new StreamReader(stream, enc);
                    sHTML = read.ReadToEnd();
                }
                else
                {
                    read = new StreamReader(stream, enc);
                    sHTML = read.ReadToEnd();

                    // Look for a charset declared inside the document itself.
                    Match charSetMatch = Regex.Match(sHTML, "charset=(?<code>[a-zA-Z0-9\\-]+)", RegexOptions.IgnoreCase);
                    string sChartSet = charSetMatch.Groups["code"].Value;

                    // Heuristic: if the decoded text contains no run of 2-4 CJK
                    // characters, assume it was decoded with the wrong encoding and
                    // re-decode it using the charset declared in the document.
                    Regex rx = new Regex("([\u4e00-\u9fa5]{2,4})");

                    if (!rx.IsMatch(sHTML) && !string.IsNullOrEmpty(sChartSet.Trim()))
                    {
                        sHTML = Encoding.GetEncoding(sChartSet).GetString(enc.GetBytes(sHTML));
                    }
                }

                // Strip line breaks and tabs.
                sHTML = sHTML.Replace("\n", "").Replace("\r", "").Replace("\t", "");

                // Mark the crawl task as successful.
                crawlerState = IWOMWebCrawlerDbLayer.Common.CrawlerState.Success;

                return sHTML;
            }
            catch (Exception ex)
            {
                CommonFunction.logWirte(ex.ToString() + strUrl, LogGrade.Warning);

                // Detect HTTP 503 from the status code carried by the WebException;
                // the localized message text is only matched as a fallback because
                // it is present solely on a Chinese-language runtime.
                WebException webEx = ex as WebException;
                HttpWebResponse errorResp = webEx != null ? webEx.Response as HttpWebResponse : null;

                bool isServiceUnavailable =
                    (errorResp != null && errorResp.StatusCode == HttpStatusCode.ServiceUnavailable)
                    || ex.Message.IndexOf("远程服务器返回错误: (503) 服务器不可用", StringComparison.Ordinal) > -1;

                if (errorResp != null)
                {
                    // Release the connection held by the error response.
                    errorResp.Close();
                }

                if (isServiceUnavailable)
                {
                    crawlerState = IWOMWebCrawlerDbLayer.Common.CrawlerState.Forbat;
                }
                else
                {
                    crawlerState = IWOMWebCrawlerDbLayer.Common.CrawlerState.Error;
                }
            }
            finally
            {
                // Close from the outermost wrapper inwards: reader, then stream,
                // then the response itself.
                if (read != null)
                {
                    read.Close();
                }
                if (stream != null)
                {
                    stream.Close();
                }
                if (resp != null)
                {
                    resp.Close();
                }
            }

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值