/// <summary>
/// Builds the list of articles found in a page's HTML.
/// </summary>
/// <remarks>
/// The HTML is lower-cased, split into <c>&lt;p class="t4"&gt;…&lt;/p&gt;</c> blocks, and each block
/// that contains a "评论" (comment) link yields one result carrying URL, title, author,
/// site name and, when it can be resolved, creation time. Blocks without such a link are skipped.
/// NOTE(review): because matching runs on the lower-cased HTML, the extracted URL, title and
/// author text are lower-cased as well — confirm that this is intended.
/// </remarks>
/// <param name="HTMLContent">Page HTML; <c>null</c> or empty yields an empty list.</param>
/// <param name="task_ID">Task identifier stored on every returned result.</param>
/// <returns>The extracted results; never <c>null</c>.</returns>
protected override List<CrawlerResult> GetArticleByHtml(string HTMLContent, int task_ID)
{
    List<CrawlerResult> results = new List<CrawlerResult>();
    if (string.IsNullOrEmpty(HTMLContent))
    {
        return results;
    }

    // All patterns are built once per call instead of once per matched block.
    Regex blockRegex = new Regex(@"<p class=""t4"">[\s\S]+?</p>");
    Regex hrefRegex = new Regex("<a href=[^>]*>评论[^>]*</a>[^<]*<span");
    Regex stampRegex = new Regex(@"[\d]{1,2}月[\d]{1,2}日 [\d]{1,2}:[\d]{1,2}");
    Regex titleRegex = new Regex(@"<p class=""t4"">[\s\S]+?评论");
    Regex authorRegex = new Regex(@"<a href=""/t2/othdoc.do[^<]*</a>");
    Regex timeSpanRegex = new Regex(@"<span class=""time"">[\d]{1,2}[\s\S]+?</span>");

    // Invariant lower-casing keeps the ASCII patterns above matching under every server
    // culture (culture-sensitive ToLower maps "I" to a dotless i under Turkic cultures).
    foreach (Match blockMatch in blockRegex.Matches(HTMLContent.ToLowerInvariant()))
    {
        string block = blockMatch.Value;

        // URL: a block without a comment link produces no result.
        Match link = hrefRegex.Match(block);
        if (!link.Success)
        {
            continue;
        }

        CrawlerResult item = new CrawlerResult();
        item.Task_ID = task_ID;
        // GetURL is defined elsewhere; presumably it extracts the href value — verify.
        item.Url = "http://w.sohu.com" + GetURL(link.Value);

        // Title
        Match titleMatch = titleRegex.Match(block);
        if (titleMatch.Success)
        {
            item.Title = CommonFunction.DeleteHTMLElement(titleMatch.Value);
        }

        // Author
        Match authorMatch = authorRegex.Match(block);
        if (authorMatch.Success)
        {
            item.Author = CommonFunction.DeleteHTMLElement(authorMatch.Value);
        }

        // Media / site name
        item.SiteName = "XXXXXX";

        // Creation time: an explicit "M月d日 H:m" stamp wins; otherwise fall back to
        // the <span class="time"> text (full date or relative wording).
        DateTime created;
        Match stamp = stampRegex.Match(block);
        if (stamp.Success)
        {
            if (TryParseMonthDayStamp(stamp.Value, out created))
            {
                item.CreateTime = created;
            }
            else
            {
                CommonFunction.logWirte(this.SearchName + "抓取匹配时间出错:源是" + block, IWOMWebCrawlerDbLayer.Common.LogGrade.Warning);
            }
        }
        else
        {
            Match span = timeSpanRegex.Match(block);
            if (span.Success)
            {
                string spanText = CommonFunction.DeleteHTMLElement(span.Value);
                if (TryResolveSpanTime(spanText, block, out created))
                {
                    item.CreateTime = created;
                }
            }
        }

        results.Add(item);
    }

    return results;
}

/// <summary>
/// Parses a "M月d日 H:m" stamp, first with the current culture and then with an explicit
/// format so that parsing does not depend on the server running under a CJK culture.
/// </summary>
/// <param name="text">Stamp text such as "5月3日 12:30".</param>
/// <param name="value">The parsed value when the method returns <c>true</c>.</param>
/// <returns><c>true</c> when the text could be parsed.</returns>
private static bool TryParseMonthDayStamp(string text, out DateTime value)
{
    if (DateTime.TryParse(text, out value))
    {
        return true;
    }

    return DateTime.TryParseExact(
        text,
        "M月d日 H:m",
        System.Globalization.CultureInfo.InvariantCulture,
        System.Globalization.DateTimeStyles.None,
        out value);
}

/// <summary>
/// Resolves the text of a time span into a creation time: either a full yyyy-M-d date or
/// relative wording (minutes ago, hours ago, yesterday, the day before yesterday, days ago).
/// </summary>
/// <param name="time">Time span text with markup already removed.</param>
/// <param name="block">The whole article block, searched for an H:m clock value.</param>
/// <param name="created">The resolved time when the method returns <c>true</c>.</param>
/// <returns><c>true</c> when a creation time was resolved.</returns>
private bool TryResolveSpanTime(string time, string block, out DateTime created)
{
    created = default(DateTime);
    if (string.IsNullOrEmpty(time))
    {
        return false;
    }

    // Absolute date.
    Match fullDate = Regex.Match(time, @"[\d]{4}-[\d]{1,2}-[\d]{1,2}");
    if (fullDate.Success)
    {
        if (DateTime.TryParse(fullDate.Value, out created))
        {
            return true;
        }

        CommonFunction.logWirte(this.SearchName + "抓取匹配时间出错:源是" + time, IWOMWebCrawlerDbLayer.Common.LogGrade.Warning);
        return false;
    }

    // Relative wording. The "> 0" checks require the unit to be preceded by something
    // (the count); "yesterday" / "the day before yesterday" may start the text.
    DateTime now = DateTime.Now;
    if (time.IndexOf("分钟前", StringComparison.Ordinal) > 0)
    {
        created = now.AddMinutes(-ReadRelativeCount(time, "分钟前"));
        return true;
    }

    if (time.IndexOf("小时前", StringComparison.Ordinal) > 0)
    {
        created = now.AddHours(-ReadRelativeCount(time, "小时前"));
        return true;
    }

    // These two carry no count, so no number is parsed for them (parsing the missing
    // capture group used to log a spurious warning on every such entry).
    if (time.IndexOf("昨天", StringComparison.Ordinal) > -1)
    {
        created = ReadClockOrFallback(block, now).AddDays(-1);
        return true;
    }

    if (time.IndexOf("前天", StringComparison.Ordinal) > -1)
    {
        created = ReadClockOrFallback(block, now).AddDays(-2);
        return true;
    }

    if (time.IndexOf("天前", StringComparison.Ordinal) > 0)
    {
        created = ReadClockOrFallback(block, now).AddDays(-ReadRelativeCount(time, "天前"));
        return true;
    }

    return false;
}

/// <summary>
/// Reads the one- or two-digit count that precedes <paramref name="unit"/>, tolerating any
/// amount of whitespace (including none) between the count and the unit.
/// </summary>
/// <param name="time">Time span text.</param>
/// <param name="unit">Unit wording such as "分钟前".</param>
/// <returns>The count, or 0 when no count precedes the unit.</returns>
private int ReadRelativeCount(string time, string unit)
{
    Match countMatch = Regex.Match(time, @"([\d]{1,2})\s*" + Regex.Escape(unit));
    if (!countMatch.Success)
    {
        return 0;
    }

    int count;
    if (int.TryParse(countMatch.Groups[1].Value, out count))
    {
        return count;
    }

    CommonFunction.logWirte(this.SearchName + "抓取匹配时间2出错:源是" + time, IWOMWebCrawlerDbLayer.Common.LogGrade.Warning);
    return 0;
}

/// <summary>
/// Finds the first H:m clock value in the block and parses it (DateTime.TryParse puts a
/// time-only value on today's date). An unparsable value such as "99:99" is logged and
/// replaced by the fallback instead of throwing and aborting the whole page.
/// </summary>
/// <param name="block">The article block to search.</param>
/// <param name="fallback">Value returned when no usable clock value is present.</param>
/// <returns>The parsed clock value, or <paramref name="fallback"/>.</returns>
private DateTime ReadClockOrFallback(string block, DateTime fallback)
{
    Match clock = Regex.Match(block, @"[\d]{1,2}:[\d]{1,2}");
    if (!clock.Success)
    {
        return fallback;
    }

    DateTime parsed;
    if (DateTime.TryParse(CommonFunction.DeleteHTMLElement(clock.Value), out parsed))
    {
        return parsed;
    }

    CommonFunction.logWirte(this.SearchName + "抓取匹配时间2出错:源是" + block, IWOMWebCrawlerDbLayer.Common.LogGrade.Warning);
    return fallback;
}