无视网页编码获得Html的一个方法


最近在写一个比较简单的分布式爬虫,爬取的网页编码各式各样,所以写了一套方法,用来在不预先知道网页编码的情况下获取 HTML。



下面上代码



/// <summary>
        /// Downloads the page at <paramref name="url"/> and decodes it to text without
        /// knowing its encoding in advance. The charset from the HTTP Content-Type header
        /// is used first; if the page itself declares a different charset in a
        /// <c>&lt;meta&gt;</c> tag, the raw bytes are decoded again with that charset.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <param name="encode">
        /// Receives the encoding that was finally used to decode the page. It is left
        /// untouched when the request fails before an encoding could be chosen.
        /// </param>
        /// <returns>
        /// The decoded HTML, or an empty string when the download failed and no retry
        /// was made (or the retry failed as well).
        /// </returns>
        public static string GetDataFromUrl(string url, ref Encoding encode)
        {
            try
            {
                HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);
                // Configure the HTTP request headers.
                request.AllowAutoRedirect = true;
                request.AllowWriteStreamBuffering = true;
                request.UserAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.162 Safari/535.19";
                request.Method = "GET";
                request.Timeout = 10 * 1000;

                string characterSet;
                byte[] raw;

                // Buffer the whole body as bytes so it can be decoded more than once.
                // The using blocks release the connection even when reading throws.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    characterSet = response.CharacterSet;
                    using (Stream receiveStream = response.GetResponseStream())
                    using (MemoryStream buffered = new MemoryStream())
                    {
                        byte[] chunk = new byte[8192];
                        int count;
                        while ((count = receiveStream.Read(chunk, 0, chunk.Length)) > 0)
                        {
                            buffered.Write(chunk, 0, count);
                        }
                        raw = buffered.ToArray();
                    }
                }

                // First guess: the charset announced by the HTTP response header.
                // CharacterSet may be null or empty when the header is missing.
                Encoding headerEncoding = null;
                if (!string.IsNullOrEmpty(characterSet))
                {
                    // HttpWebResponse reports ISO-8859-1 when the header names no charset;
                    // the original author treats that as "probably a Chinese page".
                    if (string.Equals(characterSet, "ISO-8859-1", StringComparison.OrdinalIgnoreCase))
                    {
                        characterSet = "gb2312";
                    }
                    headerEncoding = TryGetEncodingByName(characterSet);
                }
                // Unknown or missing charset: fall back to the system default instead of failing.
                encode = headerEncoding ?? Encoding.Default;

                string html = DecodeHtmlBytes(raw, encode);

                // Second guess: a charset declared inside the page. Handles both
                // <meta charset="utf-8"> and <meta http-equiv=... content="...; charset=utf-8">.
                Regex metaCharset = new Regex(
                    @"<meta\b[^>]*?charset\s*=\s*[""']?\s*([A-Za-z0-9_\-:.]+)",
                    RegexOptions.IgnoreCase);
                Match declared = metaCharset.Match(html);
                if (declared.Success)
                {
                    Encoding pageEncoding = TryGetEncodingByName(declared.Groups[1].Value);
                    // The page's own declaration wins over the header when they disagree.
                    if (pageEncoding != null && !pageEncoding.Equals(encode))
                    {
                        encode = pageEncoding;
                        html = DecodeHtmlBytes(raw, encode);
                    }
                }

                return html;
            }
            catch (Exception ex)
            {
                // NOTE(review): `s` is declared outside this method; it appears to be a flag
                // that allows one logged retry and is never reset here — confirm with its owner.
                if (s)
                {
                    Console.WriteLine(ex.Message);
                    using (StreamWriter log = new StreamWriter("err.dst", true))
                    {
                        log.WriteLine(url);
                        log.WriteLine(ex.ToString());
                    }
                    s = false;
                    //System.Threading.Thread.Sleep(100000);
                    // Return the retry's result; previously it was computed and then discarded.
                    return GetDataFromUrl(url, ref encode);
                }
            }
            return "";

        }

        /// <summary>
        /// Decodes <paramref name="raw"/> with <paramref name="encoding"/>; a byte-order
        /// mark, when present, is honoured and stripped by the reader.
        /// </summary>
        private static string DecodeHtmlBytes(byte[] raw, Encoding encoding)
        {
            using (StreamReader reader = new StreamReader(new MemoryStream(raw), encoding))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Resolves a charset name to an <see cref="Encoding"/>, returning null instead
        /// of throwing when the name is empty or not recognised.
        /// </summary>
        private static Encoding TryGetEncodingByName(string name)
        {
            if (string.IsNullOrEmpty(name))
            {
                return null;
            }

            // Tolerate surrounding whitespace or quotes, e.g. charset="utf-8".
            string cleaned = name.Trim().Trim('"', '\'');
            if (cleaned.Length == 0)
            {
                return null;
            }

            try
            {
                return Encoding.GetEncoding(cleaned);
            }
            catch (ArgumentException)
            {
                return null;
            }
        }


  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值