抓取网址中的信息（返回内容含 \uXXXX 转义，需要先解码）

最新推荐文章于 2022-05-10 22:49:32 发布
hjingtao
最新推荐文章于 2022-05-10 22:49:32 发布
阅读量728
点赞数
文章标签： regex string exception byte encoding 任务
本文链接：https://blog.csdn.net/hjingtao/article/details/7784454
版权
/// <summary>
        /// Builds the crawl parameters (<c>crawlerModel</c>) from the given task.
        /// </summary>
        /// <param name="taskItem">Task supplying the keyword, the requested item count and the position.</param>
        protected override void initCrawlerModel(IwomTask taskItem)
        {
            // Upper bound for the page size; larger requested counts are clamped to it.
            const int maxPageSize = 20;

            crawlerModel = new CrawlerModel();

            // Search keyword, assembled by the shared helper.
            crawlerModel.Keyword = CommonFunction.AssembledKeyword(
                taskItem.KeyWord,
                IWOMWebCrawlerDbLayer.Common.KeyWordUrlEncode.Normal,
                true);

            // Page size: the task's item count, capped at maxPageSize.
            if (taskItem.GetItems > maxPageSize)
            {
                crawlerModel.PageSize = maxPageSize;
            }
            else
            {
                crawlerModel.PageSize = taskItem.GetItems;
            }

            crawlerModel.Postion = taskItem.Task_Postion;
        }
        /// <summary>
        /// Builds the search URL for the keyword held in <c>crawlerModel</c>.
        /// </summary>
        /// <param name="pageIndex">Requested page index. It is not used: the query string always carries <c>start=0</c>.</param>
        /// <returns>The search URL with <c>crawlerModel.Keyword</c> inserted as the <c>searchKeyWords</c> value.</returns>
        protected override string createUrl(int pageIndex)
        {
            // Marked by the original author as "needs revision".
            // Previously used endpoint:
            //   http://cbnsearch.yicai.com/cbnsearch.html?start=0&pagecount=20&documentType=1&datetype=1&contenttype=1&searchKeyWords=<keyword>
            // NOTE(review): pageIndex is ignored, so every call yields the same URL — confirm whether
            // "start" should be derived from pageIndex.
            const string queryPrefix = "http://cbnsearch.yicai.com/searchresult.php?start=0&pagecount=20&documentType=1&datetype=1&searchKeyWords=";
            const string querySuffix = "&contenttype=1";

            return queryPrefix + crawlerModel.Keyword + querySuffix;
        }
        /// <summary>
        /// Pause applied between two page fetches.
        /// </summary>
        protected override void PageSleep()
        {
            // Blocks the calling thread for half a second.
            const int pauseMilliseconds = 500;
            Thread.Sleep(pauseMilliseconds);
        }
        /// <summary>
        /// Character encoding of the fetched pages.
        /// </summary>
        /// <returns>Always <see cref="Encoding.UTF8"/>.</returns>
        protected override Encoding getPageEncoding()
        {
            Encoding pageEncoding = Encoding.UTF8;
            return pageEncoding;
        }
        /// <summary>
        /// Content-based check for whether the crawler has been blocked.
        /// </summary>
        /// <param name="HTMLContent">Fetched page text; it is not inspected.</param>
        /// <returns>Always <c>true</c>.</returns>
        protected override bool checkContentIsForbat(string HTMLContent)
        {
            // NOTE(review): whether true means "blocked" or "content is fine" is decided by the
            // base class, which is not visible here — confirm the constant is intended.
            const bool fixedAnswer = true;
            return fixedAnswer;
        }
        /// <summary>
        /// Content-based check for whether the fetched page is the last one.
        /// </summary>
        /// <param name="HTMLContent">Fetched page text; it is not inspected.</param>
        /// <returns>Always <c>false</c>.</returns>
        protected override bool checkContentIsLastPage(string HTMLContent)
        {
            const bool fixedAnswer = false;
            return fixedAnswer;
        }

        /// <summary>
        /// Extracts the article records contained in a search response.
        /// </summary>
        /// <remarks>
        /// The response is first passed through <c>NormalU2C</c>, then split into records with a regular
        /// expression. The url, title, content, creationDate and author fields are read from each record.
        /// A record without a url/title pair, or with a url too short to rebuild, is logged as a
        /// warning and skipped instead of aborting the whole page.
        /// </remarks>
        /// <param name="HTMLContent">Raw response text.</param>
        /// <param name="task_ID">Task id stored on every produced result.</param>
        /// <returns>One <c>CrawlerResult</c> per usable record; an empty list when nothing matched.</returns>
        protected override List<CrawlerResult> GetArticleByHtml(string HTMLContent, int task_ID)
        {
            // Number of leading characters dropped from the cleaned href before the year is re-inserted.
            // It equals the length of "http://www.yicai.com/news/".
            // NOTE(review): presumably the decoded href starts with "http://www.yicai.com/news" plus one
            // character produced by NormalU2C in place of the year — confirm against a live response.
            const int hrefPrefixLength = 26;

            HTMLContent = NormalU2C(HTMLContent);
            List<CrawlerResult> arrayList = new List<CrawlerResult>();

            Regex regex = new Regex(@"{\042id\042:[\s\S]+?}"); // one record
            Regex regexHref = new Regex(@"\042url\042:\042(?<href>.*?)\042,\042title\042:\042(?<name>.*?)\042,"); // link and title
            Regex regexContent = new Regex(@"\042content\042:\042(?<content>[\s\S]+?)\042,"); // summary
            Regex regexTime = new Regex(@"\042creationDate\042:\042(?<time>[\s\S]+?)\042,"); // creation time
            Regex regexAuthor = new Regex(@"\042author\042:\042(?<author>[\s\S]+?)\042,"); // author

            foreach (Match recordMatch in regex.Matches(HTMLContent))
            {
                string record = recordMatch.Value;

                // Link and title are mandatory: without them no result can be built.
                Match hrefMatch = regexHref.Match(record);
                if (!hrefMatch.Success)
                {
                    CommonFunction.logWirte(this.SearchName + "抓取匹配链接出错：源是" + record, IWOMWebCrawlerDbLayer.Common.LogGrade.Warning);
                    continue;
                }

                // Drop the backslashes left in the href, then make sure the fixed prefix can be cut off.
                string href = hrefMatch.Groups["href"].Value.Replace(@"\", "");
                if (href.Length < hrefPrefixLength)
                {
                    CommonFunction.logWirte(this.SearchName + "抓取匹配链接出错：源是" + record, IWOMWebCrawlerDbLayer.Common.LogGrade.Warning);
                    continue;
                }

                CrawlerResult item = new CrawlerResult();
                item.Task_ID = task_ID;

                // Title
                item.Title = CommonFunction.DeleteHTMLElement(hrefMatch.Groups["name"].Value);

                // Summary (optional)
                Match contentMatch = regexContent.Match(record);
                if (contentMatch.Success)
                {
                    item.Summary = CommonFunction.DeleteHTMLElement(contentMatch.Groups["content"].Value);
                }

                // Author (optional)
                Match authorMatch = regexAuthor.Match(record);
                if (authorMatch.Success)
                {
                    item.Author = CommonFunction.DeleteHTMLElement(authorMatch.Groups["author"].Value);
                }

                // Media name
                item.SiteName = "一财网";

                // Creation time: parsed when the field is present and contains a '-',
                // today's date when the field is missing. In the remaining cases CreateTime
                // keeps whatever value CrawlerResult initialised it with.
                Match timeMatch = regexTime.Match(record);
                if (timeMatch.Success)
                {
                    if (timeMatch.Value.IndexOf('-') > 0)
                    {
                        DateTime parsedTime;
                        if (DateTime.TryParse(timeMatch.Groups["time"].Value, out parsedTime))
                        {
                            item.CreateTime = parsedTime;
                        }
                        else
                        {
                            CommonFunction.logWirte(this.SearchName + "抓取匹配时间出错：源是" + record, IWOMWebCrawlerDbLayer.Common.LogGrade.Warning);
                        }
                    }
                }
                else
                {
                    item.CreateTime = DateTime.Today;
                }

                // URL: fixed prefix + four-digit year + remainder of the cleaned href.
                // The year is formatted culture-independently; taking the first four characters of
                // DateTime.ToString() only yields the year in cultures that print the year first.
                string year = string.Format(System.Globalization.CultureInfo.InvariantCulture, "{0:yyyy}", item.CreateTime);
                item.Url = "http://www.yicai.com/news/" + year + href.Substring(hrefPrefixLength);

                item.FilterType = FilterType.FilterNo;
                arrayList.Add(item);
            }

            return arrayList;
        }

        /// <summary>
        /// Prepares the settings used for a test run and returns a sample search URL.
        /// </summary>
        /// <returns>A search URL whose keyword is the percent-encoded UTF-8 form of "亚马逊".</returns>
        protected override string initTestUrl()
        {
            // NOTE(review): the Hase* members presumably describe which fields this crawler
            // supplies — confirm against the base class.
            HaseLastPostTime = true;
            HasePageSize = 20;
            HaseAuthor = true;
            HaseReplyCount = true;
            HaseVisitCount = true;

            const string testUrl = "http://cbnsearch.yicai.com/searchresult.php?start=0&pagecount=20&documentType=1&datetype=1&searchKeyWords=%E4%BA%9A%E9%A9%AC%E9%80%8A&contenttype=1";
            return testUrl;
        }

        #region 解码
        /// <summary>
        /// Rewrites the backslash sequences found in <paramref name="input"/>.
        /// </summary>
        /// <remarks>
        /// Every backslash is handled the same way: the character right after it is skipped without
        /// being inspected, and the following four characters are passed to <c>UnicodeCode2Str</c>.
        /// When that call throws, or fewer than four characters remain, a forward slash followed by
        /// the characters that were read is emitted instead. All other characters are copied unchanged.
        /// NOTE(review): GetArticleByHtml removes leftover backslashes and cuts a fixed-length prefix
        /// from the decoded href, so it appears to rely on this exact output — confirm before
        /// changing how escapes are recognised.
        /// </remarks>
        /// <param name="input">Text to rewrite.</param>
        /// <returns>The rewritten text.</returns>
        private string NormalU2C(string input)
        {
            StringBuilder decoded = new StringBuilder();
            int length = input.Length;
            int pos = 0;

            while (pos < length)
            {
                char current = input[pos];
                if (current != '\\')
                {
                    decoded.Append(current);
                    pos++;
                    continue;
                }

                // Step over the backslash and the character that follows it.
                pos += 2;

                // Collect up to four characters for decoding.
                char[] digits = new char[4];
                int taken = 0;
                for (; taken < 4 && pos < length; taken++, pos++)
                {
                    digits[taken] = input[pos];
                }

                bool converted = false;
                if (taken == 4)
                {
                    try
                    {
                        decoded.Append(this.UnicodeCode2Str(digits));
                        converted = true;
                    }
                    catch (Exception)
                    {
                        converted = false;
                    }
                }

                if (!converted)
                {
                    // Fallback: a forward slash, then the characters exactly as read.
                    decoded.Append(@"/");
                    decoded.Append(digits, 0, taken);
                }
            }

            return decoded.ToString();
        }
        /// <summary>
        /// Converts four hexadecimal digits into the single UTF-16 code unit they denote.
        /// </summary>
        /// <remarks>
        /// The code unit is returned as-is rather than being run through an <c>Encoding</c>, so the two
        /// halves of an escaped surrogate pair are preserved and form a valid character once the
        /// caller concatenates them. Decoding a lone half with <c>Encoding.BigEndianUnicode</c> would
        /// replace it with U+FFFD.
        /// </remarks>
        /// <param name="u4">At least four characters; only the first four are read, case-insensitively.</param>
        /// <returns>A one-character string holding the decoded code unit.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="u4"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="u4"/> has fewer than four characters.</exception>
        /// <exception cref="FormatException">One of the first four characters is not a hexadecimal digit.</exception>
        private string UnicodeCode2Str(char[] u4)
        {
            const string hexDigits = "0123456789ABCDEF";

            if (u4 == null)
            {
                throw new ArgumentNullException("u4");
            }
            if (u4.Length < 4)
            {
                throw new ArgumentException("It's not a unicode code array", "u4");
            }

            int codeUnit = 0;
            for (int i = 0; i < 4; i++)
            {
                // Ordinal lookup of the upper-cased character gives the digit's value, or -1.
                int digit = hexDigits.IndexOf(char.ToUpperInvariant(u4[i]));
                if (digit < 0)
                {
                    throw new FormatException("It's not a unicode code array");
                }
                codeUnit = (codeUnit << 4) | digit;
            }

            return ((char)codeUnit).ToString();
        }
        #endregion