时间匹配小例子

/// <summary>
        /// Extracts the article entries found in a page's HTML.
        /// </summary>
        /// <param name="HTMLContent">
        /// HTML of the page. It is lower-cased before matching, so every value taken
        /// from it (URL, title, author) is lower-case as well.
        /// </param>
        /// <param name="task_ID">Task identifier copied onto every returned result.</param>
        /// <returns>
        /// One result per <c>&lt;p class="t4"&gt;</c> block that contains a comment link;
        /// an empty list when the input is null/empty or nothing matches.
        /// </returns>
        protected override List<CrawlerResult> GetArticleByHtml(string HTMLContent, int task_ID)
        {
            List<CrawlerResult> results = new List<CrawlerResult>();
            if (string.IsNullOrEmpty(HTMLContent))
            {
                return results;
            }

            // All patterns below are written in lower case, so the page is lower-cased
            // first. The invariant variant keeps the result independent of the machine's
            // culture (e.g. Turkish casing of 'I').
            // NOTE(review): this also lower-cases the extracted URL and title - confirm
            // that downstream consumers expect that.
            MatchCollection entries = EntryBlockRegex.Matches(HTMLContent.ToLowerInvariant());
            foreach (Match entry in entries)
            {
                string entryHtml = entry.Value;

                // URL: blocks without a comment link are skipped entirely.
                Match link = CommentLinkRegex.Match(entryHtml);
                if (!link.Success)
                {
                    continue;
                }

                CrawlerResult item = new CrawlerResult();
                item.Task_ID = task_ID;
                item.Url = "http://w.sohu.com" + GetURL(link.Value);

                // Title: everything from the start of the block up to the comment link text.
                Match titleMatch = TitleRegex.Match(entryHtml);
                if (titleMatch.Success)
                {
                    item.Title = CommonFunction.DeleteHTMLElement(titleMatch.Value);
                }

                // Author
                Match authorMatch = AuthorLinkRegex.Match(entryHtml);
                if (authorMatch.Success)
                {
                    item.Author = CommonFunction.DeleteHTMLElement(authorMatch.Value);
                }

                // Media / site name
                item.SiteName = "XXXXXX";

                // Time
                FillCreateTime(item, entryHtml);

                results.Add(item);
            }
            return results;
        }

        // ---------------------------------------------------------------------
        // Time matching
        // ---------------------------------------------------------------------

        /// <summary>
        /// Sets <c>item.CreateTime</c> from the time text found in one entry block.
        /// Tries, in order: "M月d日 H:m", a <c>yyyy-M-d</c> date inside the time span,
        /// and finally a relative phrase ("N 分钟前", "昨天", ...). Leaves
        /// <c>CreateTime</c> untouched when none of them applies or parsing fails.
        /// </summary>
        /// <param name="item">Result whose <c>CreateTime</c> is filled in.</param>
        /// <param name="entryHtml">Lower-cased HTML of a single entry block.</param>
        private void FillCreateTime(CrawlerResult item, string entryHtml)
        {
            DateTime parsed;

            // Absolute form without a year, e.g. "5月3日 12:30". An exact format is used
            // so the result does not depend on the current culture; the missing year
            // defaults to the current one.
            Match monthDay = MonthDayTimeRegex.Match(entryHtml);
            if (monthDay.Success)
            {
                if (DateTime.TryParseExact(
                        monthDay.Value,
                        "M月d日 H:m",
                        System.Globalization.CultureInfo.InvariantCulture,
                        System.Globalization.DateTimeStyles.None,
                        out parsed))
                {
                    item.CreateTime = parsed;
                }
                else
                {
                    CommonFunction.logWirte(this.SearchName + "抓取匹配时间出错:源是" + entryHtml, IWOMWebCrawlerDbLayer.Common.LogGrade.Warning);
                }
                return;
            }

            // Otherwise look at the dedicated time span.
            // NOTE(review): the pattern requires the span text to start with a digit, so
            // a span that starts with "昨天"/"前天" never reaches the relative branch below.
            Match timeSpan = TimeSpanRegex.Match(entryHtml);
            if (!timeSpan.Success)
            {
                return;
            }

            string timeText = CommonFunction.DeleteHTMLElement(timeSpan.Value);
            if (string.IsNullOrEmpty(timeText))
            {
                return;
            }

            Match isoDate = IsoDateRegex.Match(timeText);
            if (isoDate.Success)
            {
                if (DateTime.TryParseExact(
                        isoDate.Value,
                        "yyyy-M-d",
                        System.Globalization.CultureInfo.InvariantCulture,
                        System.Globalization.DateTimeStyles.None,
                        out parsed))
                {
                    item.CreateTime = parsed;
                }
                else
                {
                    CommonFunction.logWirte(this.SearchName + "抓取匹配时间出错:源是" + timeText, IWOMWebCrawlerDbLayer.Common.LogGrade.Warning);
                }
                return;
            }

            ApplyRelativeTime(item, timeText, entryHtml);
        }

        /// <summary>
        /// Handles relative time phrases: "N 分钟前" (minutes ago), "N 小时前" (hours ago),
        /// "昨天" (yesterday), "前天" (the day before yesterday) and "N 天前" (days ago).
        /// </summary>
        /// <param name="item">Result whose <c>CreateTime</c> is filled in.</param>
        /// <param name="timeText">Tag-stripped text of the time span.</param>
        /// <param name="entryHtml">HTML of the whole entry, searched for an "H:m" clock value.</param>
        private void ApplyRelativeTime(CrawlerResult item, string timeText, string entryHtml)
        {
            // The "> 0" checks require at least one character (the number) before the
            // phrase; "昨天"/"前天" may start the text, hence "> -1".
            if (timeText.IndexOf("分钟前", StringComparison.Ordinal) > 0)
            {
                item.CreateTime = DateTime.Now.AddMinutes(-ReadCount(MinutesAgoRegex, timeText));
            }
            else if (timeText.IndexOf("小时前", StringComparison.Ordinal) > 0)
            {
                item.CreateTime = DateTime.Now.AddHours(-ReadCount(HoursAgoRegex, timeText));
            }
            else if (timeText.IndexOf("昨天", StringComparison.Ordinal) > -1)
            {
                // Fixed offset: there is no number to read for this phrase.
                item.CreateTime = ResolveClockBase(entryHtml).AddDays(-1);
            }
            else if (timeText.IndexOf("前天", StringComparison.Ordinal) > -1)
            {
                item.CreateTime = ResolveClockBase(entryHtml).AddDays(-2);
            }
            else if (timeText.IndexOf("天前", StringComparison.Ordinal) > 0)
            {
                int days = ReadCount(DaysAgoRegex, timeText);
                item.CreateTime = ResolveClockBase(entryHtml).AddDays(-days);
            }
        }

        /// <summary>
        /// Reads the number captured by group 1 of <paramref name="pattern"/>.
        /// </summary>
        /// <returns>
        /// The captured count, or 0 when the pattern does not match or the capture is
        /// not a plain integer (the latter is logged as a warning).
        /// </returns>
        private int ReadCount(Regex pattern, string timeText)
        {
            Match m = pattern.Match(timeText);
            if (!m.Success)
            {
                return 0;
            }

            int count;
            if (int.TryParse(
                    m.Groups[1].Value,
                    System.Globalization.NumberStyles.None,
                    System.Globalization.CultureInfo.InvariantCulture,
                    out count))
            {
                return count;
            }

            CommonFunction.logWirte(this.SearchName + "抓取匹配时间2出错:源是" + timeText, IWOMWebCrawlerDbLayer.Common.LogGrade.Warning);
            return 0;
        }

        /// <summary>
        /// Returns today's date at the first "H:m" clock value found in the entry, or the
        /// current time when there is none or it cannot be parsed (logged as a warning).
        /// </summary>
        private DateTime ResolveClockBase(string entryHtml)
        {
            DateTime now = DateTime.Now;

            // NOTE(review): the clock is searched in the whole entry HTML, not only in
            // the time span, so a "dd:dd" sequence earlier in the entry wins - confirm intended.
            Match clock = ClockRegex.Match(entryHtml);
            if (!clock.Success)
            {
                return now;
            }

            string clockText = CommonFunction.DeleteHTMLElement(clock.Value);
            DateTime parsed;
            if (DateTime.TryParse(
                    clockText,
                    System.Globalization.CultureInfo.InvariantCulture,
                    System.Globalization.DateTimeStyles.None,
                    out parsed))
            {
                return parsed;
            }

            // The pattern admits values such as "99:99"; fall back instead of throwing
            // so one malformed entry does not abort the whole page.
            CommonFunction.logWirte(this.SearchName + "抓取匹配时间出错:源是" + clockText, IWOMWebCrawlerDbLayer.Common.LogGrade.Warning);
            return now;
        }

        // Patterns are built once instead of once per entry.
        private static readonly Regex EntryBlockRegex = new Regex(@"<p class=""t4"">[\s\S]+?</p>");
        private static readonly Regex CommentLinkRegex = new Regex("<a href=[^>]*>评论[^>]*</a>[^<]*<span");
        private static readonly Regex TitleRegex = new Regex(@"<p class=""t4"">[\s\S]+?评论");
        private static readonly Regex AuthorLinkRegex = new Regex(@"<a href=""/t2/othdoc.do[^<]*</a>");
        private static readonly Regex MonthDayTimeRegex = new Regex(@"[\d]{1,2}月[\d]{1,2}日 [\d]{1,2}:[\d]{1,2}");
        private static readonly Regex TimeSpanRegex = new Regex(@"<span class=""time"">[\d]{1,2}[\s\S]+?</span>");
        private static readonly Regex IsoDateRegex = new Regex(@"[\d]{4}-[\d]{1,2}-[\d]{1,2}");
        private static readonly Regex ClockRegex = new Regex(@"[\d]{1,2}:[\d]{1,2}");
        private static readonly Regex MinutesAgoRegex = new Regex(@"([\d]{1,2}) 分钟前");
        private static readonly Regex HoursAgoRegex = new Regex(@"([\d]{1,2}) 小时前");
        private static readonly Regex DaysAgoRegex = new Regex(@"([\d]{1,2}) 天前");

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值