/// <summary>
/// Builds the crawl parameters (<c>crawlerModel</c>) from the given task.
/// </summary>
/// <param name="taskItem">Task that supplies the keyword, the requested item count and the position.</param>
protected override void initCrawlerModel(IwomTask taskItem)
{
    crawlerModel = new CrawlerModel();

    // Search keyword, assembled by the shared helper in the "Normal" URL-encode mode.
    crawlerModel.Keyword = CommonFunction.AssembledKeyword(
        taskItem.KeyWord,
        IWOMWebCrawlerDbLayer.Common.KeyWordUrlEncode.Normal,
        true);

    // Page size: the requested item count, capped at 20.
    if (taskItem.GetItems > 20)
    {
        crawlerModel.PageSize = 20;
    }
    else
    {
        crawlerModel.PageSize = taskItem.GetItems;
    }

    crawlerModel.Postion = taskItem.Task_Postion;
}
/// <summary>
/// Builds the search URL for the current crawl parameters.
/// </summary>
/// <param name="pageIndex">
/// Index of the page being requested. NOTE(review): currently unused; the query
/// string always sends <c>start=0</c>, so every call produces the same URL.
/// The original author marked this method as "needs modification".
/// </param>
/// <returns>The search-result URL with <c>crawlerModel.Keyword</c> as the <c>searchKeyWords</c> value.</returns>
protected override string createUrl(int pageIndex)
{
    // TODO: needs modification (carried over from the original comment).
    // An earlier variant of this URL targeted cbnsearch.html instead of searchresult.php.
    const string beforeKeyword = "http://cbnsearch.yicai.com/searchresult.php?start=0&pagecount=20&documentType=1&datetype=1&searchKeyWords=";
    const string afterKeyword = "&contenttype=1";

    return beforeKeyword + crawlerModel.Keyword + afterKeyword;
}
/// <summary>
/// Pauses the current thread between two consecutive page fetches.
/// </summary>
protected override void PageSleep()
{
    // Half a second between pages.
    const int pauseMilliseconds = 500;
    Thread.Sleep(pauseMilliseconds);
}
/// <summary>
/// Gets the text encoding used for the fetched pages.
/// </summary>
/// <returns>Always <see cref="Encoding.UTF8"/>.</returns>
protected override Encoding getPageEncoding()
{
    Encoding pageEncoding = Encoding.UTF8;
    return pageEncoding;
}
/// <summary>
/// Hook intended to decide, from the page content, whether the crawler has been blocked.
/// </summary>
/// <param name="HTMLContent">Fetched page content; not inspected by this implementation.</param>
/// <returns>
/// Always <c>true</c>. NOTE(review): whether <c>true</c> means "blocked" or
/// "check passed" is defined by the base class; confirm against the caller.
/// </returns>
protected override bool checkContentIsForbat(string HTMLContent)
{
    const bool fixedAnswer = true;
    return fixedAnswer;
}
/// <summary>
/// Hook intended to decide, from the page content, whether the last result page was reached.
/// </summary>
/// <param name="HTMLContent">Fetched page content; not inspected by this implementation.</param>
/// <returns>
/// Always <c>false</c>, i.e. this check never reports a last page.
/// NOTE(review): paging termination therefore has to come from elsewhere; confirm in the base class.
/// </returns>
protected override bool checkContentIsLastPage(string HTMLContent)
{
    const bool fixedAnswer = false;
    return fixedAnswer;
}
/// <summary>
/// Extracts the article records from a search-result response.
/// </summary>
/// <remarks>
/// The response is first passed through <c>NormalU2C</c> and then scanned with regular
/// expressions for <c>{"id": ... }</c> records. For each record the link, title,
/// summary, author and creation date are pulled out.
/// <para>
/// Records without a usable link/title pair are logged and skipped instead of
/// aborting the whole page with an out-of-range exception.
/// </para>
/// <para>
/// The article URL is rebuilt as "http://www.yicai.com/news/" + year + tail, where
/// the tail is the decoded link minus its first 26 characters. This mirrors the
/// original logic: <c>NormalU2C</c> consumes a backslash, the following character and
/// four hex digits as one escape, so a JSON-escaped <c>\/2013</c> path segment
/// (presumably the year folder of the link) comes out as a single character and
/// the year has to be re-inserted from the creation date.
/// </para>
/// </remarks>
/// <param name="HTMLContent">Raw response text; <c>null</c> or empty yields an empty list.</param>
/// <param name="task_ID">Task id stored on every produced result.</param>
/// <returns>The parsed results, in the order they appear in the response.</returns>
protected override List<CrawlerResult> GetArticleByHtml(string HTMLContent, int task_ID)
{
    List<CrawlerResult> articles = new List<CrawlerResult>();
    if (string.IsNullOrEmpty(HTMLContent))
    {
        return articles;
    }

    HTMLContent = NormalU2C(HTMLContent);

    // In these patterns \042 is the octal escape for a double quote.
    Regex regexRecord = new Regex(@"{\042id\042:[\s\S]+?}");   // one record
    Regex regexHref = new Regex(@"\042url\042:\042(?<href>.*?)\042,\042title\042:\042(?<name>.*?)\042,");   // link and title
    Regex regexContent = new Regex(@"\042content\042:\042(?<content>[\s\S]+?)\042,");   // summary
    Regex regexTime = new Regex(@"\042creationDate\042:\042(?<time>[\s\S]+?)\042,");   // creation time
    Regex regexAuthor = new Regex(@"\042author\042:\042(?<author>[\s\S]+?)\042,");   // author

    const string articleUrlPrefix = "http://www.yicai.com/news/";
    // Number of leading characters of the decoded link that are replaced by prefix + year.
    const int decodedLinkHeadLength = 26;

    foreach (Match record in regexRecord.Matches(HTMLContent))
    {
        string recordText = record.Value;

        // Link and title come from the same match; without it the record is unusable.
        Match linkMatch = regexHref.Match(recordText);
        string decodedLink = linkMatch.Success
            ? linkMatch.Groups["href"].Value.Replace(@"\", "")
            : string.Empty;
        if (!linkMatch.Success || decodedLink.Length < decodedLinkHeadLength)
        {
            // Log text: "failed to match the link while crawling: source is ...".
            CommonFunction.logWirte(this.SearchName + "抓取匹配链接出错:源是" + recordText, IWOMWebCrawlerDbLayer.Common.LogGrade.Warning);
            continue;
        }

        CrawlerResult item = new CrawlerResult();
        item.Task_ID = task_ID;

        // Title
        item.Title = CommonFunction.DeleteHTMLElement(linkMatch.Groups["name"].Value);

        // Summary
        Match contentMatch = regexContent.Match(recordText);
        if (contentMatch.Success)
        {
            item.Summary = CommonFunction.DeleteHTMLElement(contentMatch.Groups["content"].Value);
        }

        // Author
        Match authorMatch = regexAuthor.Match(recordText);
        if (authorMatch.Success)
        {
            item.Author = CommonFunction.DeleteHTMLElement(authorMatch.Groups["author"].Value);
        }

        // Site name (runtime value, kept as-is)
        item.SiteName = "一财网";

        // Creation time
        Match timeMatch = regexTime.Match(recordText);
        if (!timeMatch.Success)
        {
            // No creation date in the record: fall back to today's date (midnight).
            item.CreateTime = DateTime.Today;
        }
        else if (timeMatch.Value.IndexOf('-') > 0)
        {
            DateTime created;
            if (DateTime.TryParse(
                    timeMatch.Groups["time"].Value,
                    System.Globalization.CultureInfo.InvariantCulture,
                    System.Globalization.DateTimeStyles.None,
                    out created))
            {
                item.CreateTime = created;
            }
            else
            {
                // As before, an unparsable date is logged and CreateTime keeps its initial value.
                CommonFunction.logWirte(this.SearchName + "抓取匹配时间出错:源是" + recordText, IWOMWebCrawlerDbLayer.Common.LogGrade.Warning);
            }
        }

        // URL: the year is formatted culture-independently; the previous
        // ToString().Substring(0, 4) only produced a year in year-first cultures.
        string year = string.Format(System.Globalization.CultureInfo.InvariantCulture, "{0:yyyy}", item.CreateTime);
        item.Url = articleUrlPrefix + year + decodedLink.Substring(decodedLinkHeadLength);

        item.FilterType = FilterType.FilterNo;
        articles.Add(item);
    }

    return articles;
}
/// <summary>
/// Sets the flags used for a test run and returns a fixed sample search URL.
/// </summary>
/// <remarks>
/// NOTE(review): the reply-count and visit-count flags are switched on here although
/// the article parser in this file does not fill such values; confirm they are intended.
/// </remarks>
/// <returns>A search URL with a fixed, already URL-encoded keyword.</returns>
protected override string initTestUrl()
{
    const string sampleSearchUrl = "http://cbnsearch.yicai.com/searchresult.php?start=0&pagecount=20&documentType=1&datetype=1&searchKeyWords=%E4%BA%9A%E9%A9%AC%E9%80%8A&contenttype=1";

    // Flags are assigned in the same order as before.
    HaseLastPostTime = true;
    HasePageSize = 20;
    HaseAuthor = true;
    HaseReplyCount = true;
    HaseVisitCount = true;

    return sampleSearchUrl;
}
#region 解码
/// <summary>
/// Replaces backslash escapes of the form <c>\uXXXX</c> in <paramref name="input"/>
/// with the character they denote.
/// </summary>
/// <remarks>
/// The scan is deliberately lenient, and its exact output is relied upon by
/// <c>GetArticleByHtml</c>, so these rules are preserved unchanged:
/// <list type="bullet">
/// <item>After a backslash the next character is skipped without checking that it is 'u'.</item>
/// <item>If the four characters after that decode as hex digits, the decoded character is emitted.</item>
/// <item>If they do not, a forward slash followed by those four characters is emitted.</item>
/// <item>If the input ends before four characters are available, a forward slash followed by
/// whatever was left is emitted and scanning stops.</item>
/// </list>
/// The output is built with a <see cref="StringBuilder"/> instead of repeated string
/// concatenation, which was quadratic in the size of the response.
/// </remarks>
/// <param name="input">Text to decode; <c>null</c> or empty yields an empty string.</param>
/// <returns>The decoded text.</returns>
private string NormalU2C(string input)
{
    if (string.IsNullOrEmpty(input))
    {
        return string.Empty;
    }

    StringBuilder decoded = new StringBuilder(input.Length);
    int pos = 0;
    while (pos < input.Length)
    {
        char current = input[pos];
        if (current != '\\')
        {
            decoded.Append(current);
            pos++;
            continue;
        }

        // Skip the backslash and the character right after it, then collect up to four characters.
        int firstDigit = pos + 2;
        char[] digits = new char[4];
        int count = 0;
        while (count < 4 && firstDigit + count < input.Length)
        {
            digits[count] = input[firstDigit + count];
            count++;
        }

        if (count < 4)
        {
            // Input ended inside the escape: emit what is left and stop.
            decoded.Append('/').Append(digits, 0, count);
            break;
        }

        try
        {
            decoded.Append(this.UnicodeCode2Str(digits));
        }
        catch (Exception)
        {
            // Not four hex digits: keep the characters, prefixed with a forward slash.
            decoded.Append('/').Append(digits, 0, 4);
        }

        pos = firstDigit + 4;
    }

    return decoded.ToString();
}
/// <summary>
/// Converts four hexadecimal digits into the single UTF-16 code unit they denote.
/// </summary>
/// <remarks>
/// The code unit is returned as-is, including surrogate halves. The previous
/// implementation decoded the two bytes with <c>Encoding.BigEndianUnicode</c>, which
/// replaces a lone surrogate with U+FFFD and therefore destroyed characters outside
/// the BMP that arrive as two consecutive escapes.
/// </remarks>
/// <param name="u4">Array whose first four elements are hex digits (either case); extra elements are ignored.</param>
/// <returns>A one-character string holding the decoded code unit.</returns>
/// <exception cref="ArgumentNullException"><paramref name="u4"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException"><paramref name="u4"/> has fewer than four elements.</exception>
/// <exception cref="FormatException">One of the first four elements is not a hex digit.</exception>
private string UnicodeCode2Str(char[] u4)
{
    if (u4 == null)
    {
        throw new ArgumentNullException("u4");
    }
    if (u4.Length < 4)
    {
        throw new ArgumentException("It's not a unicode code array", "u4");
    }

    const string hexDigits = "0123456789ABCDEF";
    int codeUnit = 0;
    for (int i = 0; i < 4; i++)
    {
        int digit = hexDigits.IndexOf(char.ToUpperInvariant(u4[i]));
        if (digit < 0)
        {
            throw new FormatException("It's not a unicode code array");
        }
        codeUnit = (codeUnit << 4) | digit;
    }

    return ((char)codeUnit).ToString();
}
#endregion