爬虫程序

        // Encoding used when the caller supplies none and the response header names none.
        private const string DefaultCharSet = "gb2312";

        // Captures a bare charset token (group 1) from a Content-Type header value,
        // stopping at quotes, semicolons and whitespace.
        private const string HeaderCharSetPattern = @"charset\s*=\s*[""']?\s*([\w.:-]+)";

        // Captures the charset token (group 1) declared inside a single <meta ...> tag.
        private const string MetaCharSetPattern = @"<meta[^>]*?charset\s*=\s*[""']?\s*([\w.:-]+)";

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and returns its body decoded as text.
        /// </summary>
        /// <remarks>
        /// The encoding is chosen in this order: the charset named by the
        /// <c>Content-Type</c> response header; otherwise the body is decoded with
        /// <paramref name="charSet"/> (or "gb2312" when that is null or empty) and, if a
        /// <c>&lt;meta ... charset=...&gt;</c> declaration in the decoded text names a
        /// different known encoding, the body is decoded again with the declared one.
        /// Charset names that the runtime does not recognise are ignored.
        /// </remarks>
        /// <param name="url">Address to fetch.</param>
        /// <param name="charSet">Fallback encoding name; null or empty selects "gb2312".</param>
        /// <returns>
        /// The decoded body; an empty string when the response has no body stream; or the
        /// text "Error:" followed by the exception message when the request or decoding fails.
        /// </returns>
        public string GetHttpSource(string url, string charSet = "")
        {
            try
            {
                var request = (HttpWebRequest)WebRequest.Create(url);
                request.Proxy = null;
                request.Timeout = 15 * 1000; // connect timeout, in milliseconds
                request.Accept = "*/*";
                request.UserAgent = "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.1 (KHTML, like Gecko) Chrome/21.0.1180.83 Safari/536.1";
                request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

                using (var response = (HttpWebResponse)request.GetResponse())
                {
                    // Buffer the whole body once so it can be decoded more than once.
                    byte[] body;
                    using (var stream = response.GetResponseStream())
                    {
                        if (stream == null)
                        {
                            return string.Empty;
                        }

                        body = ReadAllBytes(stream);
                    }

                    // 1) Prefer the encoding announced by the server.
                    var headerEncoding = FindDeclaredEncoding(response.Headers["Content-Type"], HeaderCharSetPattern);
                    if (headerEncoding != null)
                    {
                        return DecodeBytes(body, headerEncoding);
                    }

                    // 2) Decode with the fallback, then honour a differing <meta> declaration.
                    var fallbackEncoding = Encoding.GetEncoding(string.IsNullOrEmpty(charSet) ? DefaultCharSet : charSet);
                    var text = DecodeBytes(body, fallbackEncoding);

                    var metaEncoding = FindDeclaredEncoding(text, MetaCharSetPattern);
                    if (metaEncoding != null && !metaEncoding.Equals(fallbackEncoding))
                    {
                        text = DecodeBytes(body, metaEncoding);
                    }

                    return text;
                }
            }
            catch (Exception ex)
            {
                // Callers receive failures as an "Error:"-prefixed string rather than an exception.
                return "Error:" + ex.Message;
            }
        }

        // Reads the stream to its end and returns everything read as a byte array.
        private static byte[] ReadAllBytes(Stream stream)
        {
            using (var buffer = new MemoryStream())
            {
                stream.CopyTo(buffer);
                return buffer.ToArray();
            }
        }

        // Decodes the bytes with the given encoding (a byte-order mark, if present, takes precedence).
        private static string DecodeBytes(byte[] bytes, Encoding encoding)
        {
            using (var reader = new StreamReader(new MemoryStream(bytes), encoding))
            {
                return reader.ReadToEnd();
            }
        }

        // Resolves the charset captured by group 1 of the pattern; null when the text is
        // empty, nothing matches, or the captured name is not a known encoding.
        private static Encoding FindDeclaredEncoding(string text, string pattern)
        {
            if (string.IsNullOrEmpty(text))
            {
                return null;
            }

            var match = Regex.Match(text, pattern, RegexOptions.IgnoreCase);
            if (!match.Success)
            {
                return null;
            }

            try
            {
                return Encoding.GetEncoding(match.Groups[1].Value);
            }
            catch (ArgumentException)
            {
                // Unknown or malformed charset name: behave as if none was declared.
                return null;
            }
        }


        /// <summary>
        /// Strips markup from a piece of HTML by running it through a fixed, ordered
        /// sequence of <c>ReplaceHtml</c> calls, each replacing one pattern with an empty string.
        /// </summary>
        /// <remarks>
        /// <c>ReplaceHtml</c> is defined outside this block; presumably it performs a
        /// regular-expression replace of (pattern, replacement, input) — verify against its definition.
        /// </remarks>
        /// <param name="Content">The HTML text to clean.</param>
        /// <returns>The text after every pattern in the sequence has been applied.</returns>
        public string ClearHtml(string Content)
        {
            // The order matters and is kept exactly as authored, including the repeated
            // "th" entry: removing one tag can splice surrounding text into a new match.
            string[] removalPatterns =
            {
                "&#[^>]*;",
                "</?marquee[^>]*>",
                "</?object[^>]*>",
                "</?param[^>]*>",
                "</?embed[^>]*>",
                "</?table[^>]*>",
                " ", // NOTE(review): possibly a rendered "&nbsp;" entity in the original post — confirm
                "</?tr[^>]*>",
                "</?th[^>]*>",
                "</?p[^>]*>",
                "</?a[^>]*>",
                "</?img[^>]*>",
                "</?tbody[^>]*>",
                "</?li[^>]*>",
                "</?span[^>]*>",
                "</?div[^>]*>",
                "</?th[^>]*>",
                "</?td[^>]*>",
                "</?script[^>]*>",
                "(javascript|jscript|vbscript|vbs):",
                "on(mouse|exit|error|click|key)",
                "<\\?xml[^>]*>",
                "<\\/?[a-z]+:[^>]*>",
                "</?font[^>]*>",
                "</?b[^>]*>",
                "</?u[^>]*>",
                "</?i[^>]*>",
                "</?strong[^>]*>"
            };

            string cleaned = Content;
            foreach (string pattern in removalPatterns)
            {
                cleaned = ReplaceHtml(pattern, "", cleaned);
            }

            return cleaned;
        }

        // Upper bound on re-downloads of a page that looks truncated.
        private const int MaxRefetchAttempts = 3;

        /// <summary>
        /// Extracts link values from HTML using a caller-supplied regular expression.
        /// </summary>
        /// <remarks>
        /// When <paramref name="html"/> contains no <c>&lt;/body&gt;</c> tag it is treated as
        /// incomplete and the page is downloaded again from <paramref name="url"/>, at most
        /// <c>MaxRefetchAttempts</c> times. The first download that contains the closing tag
        /// is used; if none does, the links are taken from the HTML that was passed in.
        /// </remarks>
        /// <param name="html">HTML source to scan; null is treated as empty.</param>
        /// <param name="url">Address the HTML came from, used for re-downloading.</param>
        /// <param name="strReg">
        /// Pattern (matched with <see cref="RegexOptions.Singleline"/>) whose named group
        /// <c>key</c> captures each link.
        /// </param>
        /// <returns>The value of group <c>key</c> for every match, in document order.</returns>
        public string[] GetLinks(string html, string url, string strReg)
        {
            var linkPattern = new Regex(strReg, RegexOptions.Singleline);
            var source = html ?? string.Empty;

            if (!HasClosingBodyTag(source))
            {
                for (var attempt = 0; attempt < MaxRefetchAttempts; attempt++)
                {
                    var refetched = GetHttpSource(url, "");
                    if (HasClosingBodyTag(refetched))
                    {
                        source = refetched;
                        break;
                    }
                }
            }

            return linkPattern.Matches(source)
                              .Cast<Match>()
                              .Select(match => match.Groups["key"].Value)
                              .ToArray();
        }

        // True when the text contains a closing body tag, ignoring letter case.
        private static bool HasClosingBodyTag(string text)
        {
            return text != null
                && text.IndexOf("</body>", StringComparison.OrdinalIgnoreCase) >= 0;
        }


  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值