爬虫程序

最新推荐文章于 2023-08-23 23:55:29 发布

惟楚有才

最新推荐文章于 2023-08-23 23:55:29 发布

阅读量746

点赞数

分类专栏：技术文章

本文链接：https://blog.csdn.net/qq289523052/article/details/25101583

版权

技术文章专栏收录该内容

58 篇文章 0 订阅

订阅专栏

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and returns its body decoded to text.
        /// </summary>
        /// <remarks>
        /// The text encoding is resolved in this order:
        /// 1. the <c>charset</c> parameter of the response's Content-Type header;
        /// 2. the <c>charset</c> declared in a <c>&lt;meta&gt;</c> tag of the document, located by
        ///    first decoding the body with the fallback encoding;
        /// 3. the fallback encoding (<paramref name="charSet"/>, or gb2312 when it is empty).
        /// A charset name that the runtime does not recognise is ignored and the next step is tried.
        /// </remarks>
        /// <param name="url">Address of the page to download.</param>
        /// <param name="charSet">
        /// Name of the fallback encoding used when the response header does not declare one.
        /// Pass "" (the default) to fall back to gb2312.
        /// </param>
        /// <returns>
        /// The decoded page source. On any failure the method does not throw; it returns the
        /// string "Error:" followed by the exception message.
        /// </returns>
        public string GetHttpSource(string url, string charSet = "")
        {
            try
            {
                var request = (HttpWebRequest)WebRequest.Create(url);
                request.Proxy = null;
                request.Timeout = 15 * 1000; // connect timeout, in milliseconds
                request.Accept = "*/*";
                request.UserAgent = "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.1 (KHTML, like Gecko) Chrome/21.0.1180.83 Safari/536.1";
                // Let the framework transparently inflate gzip/deflate responses.
                request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

                string contentType;
                byte[] body;
                // The using blocks release the connection even when reading throws.
                using (var response = (HttpWebResponse)request.GetResponse())
                using (var stream = response.GetResponseStream())
                {
                    contentType = response.Headers["Content-Type"];
                    // Buffer the raw bytes once so they can be decoded more than once.
                    body = ReadResponseBody(stream);
                }

                // 1. Encoding announced by the server.
                var headerEncoding = ResolveEncoding(contentType, "charset\\s*=\\s*[\"']?([\\w.:-]+)");
                if (headerEncoding != null)
                {
                    return DecodeBody(body, headerEncoding);
                }

                // 3. Fallback encoding; an invalid caller-supplied name still surfaces as "Error:...".
                var fallbackEncoding = Encoding.GetEncoding(string.IsNullOrEmpty(charSet) ? "gb2312" : charSet);
                var text = DecodeBody(body, fallbackEncoding);

                // 2. Encoding declared inside the document; re-decode only if it differs.
                var metaEncoding = ResolveEncoding(text, "<meta[^>]*charset\\s*=\\s*[\"']?([\\w.:-]+)");
                if (metaEncoding != null && !metaEncoding.Equals(fallbackEncoding))
                {
                    text = DecodeBody(body, metaEncoding);
                }

                return text;
            }
            catch (Exception ex)
            {
                // Callers receive failures as an "Error:"-prefixed string rather than an exception.
                return "Error:" + ex.Message;
            }
        }

        /// <summary>Reads <paramref name="stream"/> to its end; a null stream yields an empty array.</summary>
        private static byte[] ReadResponseBody(Stream stream)
        {
            if (stream == null)
            {
                return new byte[0];
            }

            using (var buffer = new MemoryStream())
            {
                var chunk = new byte[8192];
                int read;
                while ((read = stream.Read(chunk, 0, chunk.Length)) > 0)
                {
                    buffer.Write(chunk, 0, read);
                }

                return buffer.ToArray();
            }
        }

        /// <summary>
        /// Applies <paramref name="pattern"/> (case-insensitively) to <paramref name="text"/> and maps
        /// capture group 1 to an <see cref="Encoding"/>. Returns null when the text is empty, the
        /// pattern does not match, or the captured name is not a supported encoding.
        /// </summary>
        private static Encoding ResolveEncoding(string text, string pattern)
        {
            if (string.IsNullOrEmpty(text))
            {
                return null;
            }

            var match = Regex.Match(text, pattern, RegexOptions.IgnoreCase);
            if (!match.Success)
            {
                return null;
            }

            try
            {
                return Encoding.GetEncoding(match.Groups[1].Value);
            }
            catch (ArgumentException)
            {
                // Unknown or unsupported charset name: let the caller try the next source.
                return null;
            }
        }

        /// <summary>Decodes <paramref name="body"/> with <paramref name="encoding"/> via a StreamReader.</summary>
        private static string DecodeBody(byte[] body, Encoding encoding)
        {
            // Disposing the reader also disposes the underlying MemoryStream.
            using (var reader = new StreamReader(new MemoryStream(body), encoding))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Strips a fixed list of HTML constructs from <paramref name="Content"/>.
        /// </summary>
        /// <remarks>
        /// Each pattern below is passed, in order, to <c>ReplaceHtml(pattern, "", Content)</c> and the
        /// result is fed into the next pass. <c>ReplaceHtml</c> is defined elsewhere; the patterns are
        /// written as regular expressions, so it is presumably a regex replace (verify its options,
        /// e.g. case sensitivity, against that helper).
        /// Only the tags themselves are removed; text between an opening and closing tag is kept.
        /// </remarks>
        /// <param name="Content">The HTML fragment to clean.</param>
        /// <returns>The content after all patterns have been replaced with an empty string.</returns>
        public string ClearHtml(string Content)
        {
            string[] patterns =
            {
                // Numeric character references only (&#160; / &#xA0;). The previous pattern
                // "&#[^>]*;" was greedy and also deleted ordinary text up to the last ';'.
                "&#[xX]?[0-9a-fA-F]+;",
                "</?marquee[^>]*>",
                "</?object[^>]*>",
                "</?param[^>]*>",
                "</?embed[^>]*>",
                "</?table[^>]*>",
                // NOTE(review): this removes every space from the content. It may originally have
                // been an "&nbsp;" entity that was decoded when the code was published - confirm.
                " ",
                "</?tr[^>]*>",
                "</?th[^>]*>",
                "</?p[^>]*>",
                "</?a[^>]*>",
                "</?img[^>]*>",
                "</?tbody[^>]*>",
                "</?li[^>]*>",
                "</?span[^>]*>",
                "</?div[^>]*>",
                "</?td[^>]*>",
                "</?script[^>]*>",
                "(javascript|jscript|vbscript|vbs):",
                "on(mouse|exit|error|click|key)",
                "<\\?xml[^>]*>",
                "<\\/?[a-z]+:[^>]*>",
                "</?font[^>]*>",
                "</?b[^>]*>",
                "</?u[^>]*>",
                "</?i[^>]*>",
                "</?strong[^>]*>",
            };

            foreach (var pattern in patterns)
            {
                Content = ReplaceHtml(pattern, "", Content);
            }

            return Content;
        }

        /// <summary>
        /// Extracts links from a page by applying <paramref name="strReg"/> to its HTML source.
        /// </summary>
        /// <remarks>
        /// A source without a closing <c>&lt;/body&gt;</c> tag is treated as truncated: the page is
        /// downloaded again through <c>GetHttpSource</c>, a limited number of times, and the first
        /// complete copy is used. If no complete copy is obtained, links are extracted from the
        /// HTML that was passed in.
        /// </remarks>
        /// <param name="html">HTML source to search; null is treated as empty.</param>
        /// <param name="url">Address the HTML came from, used to download it again if it is truncated.</param>
        /// <param name="strReg">
        /// Regular expression (matched with <c>RegexOptions.Singleline</c>) whose named group
        /// <c>key</c> captures the link.
        /// </param>
        /// <returns>
        /// One entry per match, in match order, holding the value of the <c>key</c> group
        /// (an empty string when the group did not participate in the match).
        /// </returns>
        public string[] GetLinks(string html, string url, string strReg)
        {
            // Bounded so that a page that never contains </body> cannot recurse/loop forever.
            const int maxRefetchAttempts = 3;
            const string closingBodyTag = "</body>";

            var source = html ?? string.Empty;

            for (var attempt = 0;
                 attempt < maxRefetchAttempts
                 && source.IndexOf(closingBodyTag, StringComparison.OrdinalIgnoreCase) < 0;
                 attempt++)
            {
                var refetched = GetHttpSource(url, "");
                // Only adopt a re-fetched copy that is complete; this also skips the
                // "Error:..." string GetHttpSource returns on failure.
                if (refetched != null
                    && refetched.IndexOf(closingBodyTag, StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    source = refetched;
                }
            }

            // Match against whichever copy was finally chosen.
            var matches = new Regex(strReg, RegexOptions.Singleline).Matches(source);
            var links = new string[matches.Count];
            for (var i = 0; i < links.Length; i++)
            {
                links[i] = matches[i].Groups["key"].Value;
            }

            return links;
        }

惟楚有才

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
爬虫程序

/// /// 获取抓取链接Html的源代码 /// /// url地址 /// 编码方式、如果传入""则自动获取编码 /// public string GetHttpSource(string url, string charSet = "") { t
复制链接

扫一扫

专栏目录