I. Demo
The demo below is the code for crawling the cnblogs news site (news.cnblogs.com).
using Abot.Crawler;
using Abot.Poco;
using CsQuery.HtmlParser;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace Abot.Demo
{
    class Test
    {
        /// <summary>
        /// The site to crawl; here it is the cnblogs news site
        /// </summary>
        public static readonly Uri FeedUrl = new Uri(@"https://news.cnblogs.com/");

        /// <summary>
        /// Regex matching cnblogs news detail pages
        /// </summary>
        public static Regex NewsUrlRegex = new Regex("^https://news.cnblogs.com/n/\\d+/$", RegexOptions.Compiled);

        /// <summary>
        /// Regex matching cnblogs pagination pages
        /// </summary>
        public static Regex NewsPageRegex = new Regex("^https://news.cnblogs.com/n/page/\\d+/$", RegexOptions.Compiled);
        static void Main(string[] args)
        {
            WriteLog("Begin");
            var crawler = GetManuallyConfiguredWebCrawler();
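            // Crawl() blocks until the whole crawl finishes and returns a CrawlResult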
            var result = crawler.Crawl(FeedUrl);
            System.Console.WriteLine(result.ErrorException);
            WriteLog("end");
        }
        public static IWebCrawler GetManuallyConfiguredWebCrawler()
        {
            // Build the crawl configuration in code
            CrawlConfiguration config = new CrawlConfiguration();
            // Maximum seconds the whole crawl may run; 0 means no timeout
            config.CrawlTimeoutSeconds = 0;
            // Content types that will be downloaded
            config.DownloadableContentTypes = "text/html, text/plain";
            // Whether to crawl pages on external sites
            config.IsExternalPageCrawlingEnabled = false;
            // Whether to crawl links found on external pages
            config.IsExternalPageLinksCrawlingEnabled = false;
            // Whether to respect the site's robots.txt file
            config.IsRespectRobotsDotTextEnabled = true;
            // Whether to re-crawl URIs that were already crawled. Usually false: the set of
            // crawled URIs is kept in memory, so a large crawl would use a lot of it
            config.IsUriRecrawlingEnabled = false;
            // Maximum number of concurrent threads; too many and the target server suffers
            config.MaxConcurrentThreads = System.Environment.ProcessorCount;
            // Maximum number of pages to crawl; 0 means unlimited, size it to your needs
            config.MaxPagesToCrawl = 1000;
            // Maximum number of pages to crawl per domain; 0 means unlimited, which is typical
            config.MaxPagesToCrawlPerDomain = 0;
            // Minimum delay in milliseconds between requests to the same domain;
            // crawling too fast hammers both the CPU and the target server
            config.MinCrawlDelayPerDomainMilliSeconds = 1000;
            // Create a crawler instance with this configuration
            var crawler = new PoliteWebCrawler(config, null, null, null, null, null, null, null, null);
            // Decide whether a page should be crawled
            crawler.ShouldCrawlPage(ShouldCrawlPage);
            // Decide whether the links on a page should be crawled
            crawler.ShouldCrawlPageLinks(ShouldCrawlPageLinks);
            // Decide whether a page's content should be downloaded
            crawler.ShouldDownloadPageContent(ShouldDownloadPageContent);
            // Fired when the crawl of a single page starts
            crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
            // Fired when the crawl of a single page completes
            crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompletedAsync;
            // Fired when a page is not allowed to be crawled
            crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
            // Fired when a page's links are not allowed to be crawled
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
            return crawler;
        }
        /// <summary>
        /// Fired when the crawl of a single page starts
        /// </summary>
        /// <param name="sender"></param>
        /// <param name="e"></param>
        static void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
        {
            PageToCrawl pageToCrawl = e.PageToCrawl;
            WriteLog("crawler_ProcessPageCrawlStarting:" + pageToCrawl.Uri.AbsoluteUri);
        }

        /// <summary>
        /// Fired when a page is not allowed to be crawled
        /// </summary>
        /// <param name="sender"></param>
        /// <param name="e"></param>
        static void crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
        {
            PageToCrawl pageToCrawl = e.PageToCrawl;
            WriteLog("crawler_PageCrawlDisallowed:" + pageToCrawl.Uri.AbsoluteUri);
        }
        /// <summary>
        /// Fired when a page's links are not allowed to be crawled
        /// </summary>
        /// <param name="sender"></param>
        /// <param name="e"></param>
        static void crawler_PageLinksCrawlDisallowed(object sender, PageLinksCrawlDisallowedArgs e)
        {
            CrawledPage pageToCrawl = e.CrawledPage;
            WriteLog("crawler_PageLinksCrawlDisallowed:" + pageToCrawl.Uri.AbsoluteUri);
        }

        /// <summary>
        /// Fired when the crawl of a single page completes
        /// </summary>
        /// <param name="sender"></param>
        /// <param name="e"></param>
        public static void crawler_ProcessPageCrawlCompletedAsync(object sender, PageCrawlCompletedArgs e)
        {
            WriteLog("crawler_ProcessPageCrawlCompletedAsync:" + e.CrawledPage.Uri.AbsoluteUri);
            // Only process news detail pages
            if (NewsUrlRegex.IsMatch(e.CrawledPage.Uri.AbsoluteUri))
            {
                var csTitle = e.CrawledPage.CsQueryDocument.Select("#news_title");
                var linkDom = csTitle.FirstElement().FirstChild;
                //var newsInfo = e.CrawledPage.CsQueryDocument.Select("#news_info");
                //var dateString = newsInfo.Select(".time", newsInfo);
                var str = (e.CrawledPage.Uri.AbsoluteUri + "\t" + HtmlData.HtmlDecode(linkDom.InnerText)) + "\r\n";
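                // Append "url<TAB>title" lines to a local file as the crawl output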
System.IO.File.AppendAllText("fake.txt", str);
WriteLog(str);
}
}
        /// <summary>
        /// Only the feed page, pagination pages, and detail pages need to be crawled
        /// </summary>
        private static CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext context)
        {
            if (pageToCrawl.IsRoot || pageToCrawl.IsRetry || FeedUrl == pageToCrawl.Uri
                || NewsPageRegex.IsMatch(pageToCrawl.Uri.AbsoluteUri)
                || NewsUrlRegex.IsMatch(pageToCrawl.Uri.AbsoluteUri))
            {
                WriteLog("ShouldCrawlPage true:" + pageToCrawl.Uri.AbsoluteUri);
                return new CrawlDecision { Allow = true };
            }
            else
            {
                WriteLog("ShouldCrawlPage false:" + pageToCrawl.Uri.AbsoluteUri);
                return new CrawlDecision { Allow = false, Reason = "Uri does not match" };
            }
        }
        /// <summary>
        /// Only the feed page, pagination pages, and detail pages need their content downloaded
        /// </summary>
        private static CrawlDecision ShouldDownloadPageContent(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            if (pageToCrawl.IsRoot || pageToCrawl.IsRetry || FeedUrl == pageToCrawl.Uri
                || NewsPageRegex.IsMatch(pageToCrawl.Uri.AbsoluteUri)
                || NewsUrlRegex.IsMatch(pageToCrawl.Uri.AbsoluteUri))
            {
                WriteLog("ShouldDownloadPageContent true:" + pageToCrawl.Uri.AbsoluteUri);
                return new CrawlDecision { Allow = true };
            }
            WriteLog("ShouldDownloadPageContent false:" + pageToCrawl.Uri.AbsoluteUri);
            return new CrawlDecision { Allow = false, Reason = "Uri does not match" };
        }
        /// <summary>
        /// Decide whether the links on a crawled page should be crawled
        /// </summary>
        /// <param name="crawledPage"></param>
        /// <param name="crawlContext"></param>
        /// <returns></returns>
        private static CrawlDecision ShouldCrawlPageLinks(CrawledPage crawledPage, CrawlContext crawlContext)
        {
            if (!crawledPage.IsInternal)
                return new CrawlDecision { Allow = false, Reason = "We don't crawl links of external pages" };

            if (crawledPage.IsRoot || crawledPage.IsRetry || crawledPage.Uri == FeedUrl
                || NewsPageRegex.IsMatch(crawledPage.Uri.AbsoluteUri))
            {
                WriteLog("ShouldCrawlPageLinks true:" + crawledPage.Uri.AbsoluteUri);
                return new CrawlDecision { Allow = true };
            }
            else
            {
                WriteLog("ShouldCrawlPageLinks false:" + crawledPage.Uri.AbsoluteUri);
                return new CrawlDecision { Allow = false, Reason = "We only crawl links of pagination pages" };
            }
        }
        private static void WriteLog(string log)
        {
            System.IO.File.AppendAllText("log.txt", log);
            System.IO.File.AppendAllText("log.txt", "\r\n");
            Console.WriteLine(log);
        }
    }
}
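As an aside, instead of building CrawlConfiguration in code, Abot 1.x can read the same settings from app.config. A minimal sketch, assuming the standard abot config section is declared in app.config (note the extra using Abot.Core;):

using Abot.Core;
// Reads the <abot> section from app.config and converts it to a CrawlConfiguration
CrawlConfiguration config = AbotConfigurationSectionHandler.LoadFromXml().Convert();
var crawler = new PoliteWebCrawler(config, null, null, null, null, null, null, null, null);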
II. Log Analysis
Start log:
End log:
Decision log:
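From the WriteLog calls in the demo, the three kinds of lines look roughly like this (the URLs are illustrative):

Begin
crawler_ProcessPageCrawlStarting:https://news.cnblogs.com/
ShouldCrawlPage true:https://news.cnblogs.com/n/page/2/
crawler_ProcessPageCrawlCompletedAsync:https://news.cnblogs.com/n/123456/
end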
III. Summary
1. Abot starts by crawling the initial page; for each link it finds, it calls ShouldCrawlPage to decide whether to crawl that page, and if so, it crawls it on one of the worker threads. That's all there is to it!
2. When a page finishes crawling, the crawler raises the crawler.PageCrawlCompletedAsync event; its handler is where you process the crawled result.
3. Crawled pages can be processed with CsQuery. The basic operations:
Get the text under an element by ID:
var csTitle = e.CrawledPage.CsQueryDocument.Select("#news_title");
var linkDom = csTitle.FirstElement().FirstChild;
var title = HtmlData.HtmlDecode(linkDom.InnerText);
Get the value under a class (here, .time scoped to #news_info):
var newsInfo = e.CrawledPage.CsQueryDocument.Select("#news_info");
var dateString = newsInfo.Select(".time", newsInfo);
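The same selectors can also be tried outside of a crawl against a raw HTML string; a minimal self-contained CsQuery sketch (the HTML fragment here is made up for illustration):

using System;
using CsQuery;
// Parse an HTML fragment and query it with CSS selectors
var dom = CQ.Create("<div id='news_info'><span class='time'>2016-06-01</span></div>");
var time = dom.Select("#news_info .time").Text(); // "2016-06-01"
Console.WriteLine(time);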
Done! Don't @ me, I'm busy, thanks!