Recently I wanted to try writing a crawler in C#, so I decided to give Abot a try.
I opened VS and searched NuGet; the package is already at version 2.0:
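(For reference, the package can also be added from the command line; a quick sketch, assuming the NuGet package id is still Abot:)

dotnet add package Abot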
Since I had no experience with it, I searched for how to use it and found an example:
.net使用abot爬虫简单例子_泉诚工作室的博客-CSDN博客_abot使用教程 https://blog.csdn.net/aaa000830/article/details/85116051
I took it as a reference.
The sample produced a pile of compile errors; apparently the 2.0 upgrade changed some of the APIs, so a few modifications were needed:
using System;
using System.Text;
using System.Threading.Tasks;
using Abot2.Crawler;
using Abot2.Poco;
using AngleSharp.Dom;

namespace AbotTest
{
    public class AbotTest
    {
        private static readonly Uri FeedUrl = new Uri("https://www.jd.com/allSort.aspx");//the url to crawl

        //Fired when crawling of a single page starts
        public static void PageCrawlStarting(object sender, PageCrawlStartingArgs e)
        {
            PageToCrawl pageToCrawl = e.PageToCrawl;
        }

        //Fired when crawling of a single page completes
        public static void PageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            if (e.CrawledPage.Uri == FeedUrl)
            {
                StringBuilder sb = new StringBuilder();
                //Parse the html with AngleSharp
                var all = e.CrawledPage.AngleSharpHtmlDocument.QuerySelector(".category-items").Children;
                foreach (var col in all)
                {
                    var categories = col.QuerySelectorAll(".category-item");
                    foreach (var category in categories)
                    {
                        var first = category.QuerySelector(".item-title span").Text();
                        sb.Append("\r\n" + first + "\r\n");
                        var seconds = category.QuerySelector(".items").Children;
                        foreach (var second in seconds)
                        {
                            var secondText = second.QuerySelector("dt a").Text();
                            sb.Append(secondText + "\t");
                            var thirds = second.QuerySelector("dd").Children;
                            foreach (var third in thirds)
                            {
                                var thirdText = third.Text();
                                sb.Append(thirdText + ",");
                            }
                            sb.Remove(sb.Length - 1, 1);//drop the trailing comma
                        }
                    }
                }
                //The data is appended to fake.txt in the process's working directory
                //(under IIS Express that is C:\Program Files (x86)\IIS Express, which may require running VS as administrator)
                System.IO.File.AppendAllText("fake.txt", sb.ToString());
            }
        }

        /// <summary>
        /// Synchronous delegate that decides whether a page should be crawled
        /// </summary>
        /// <param name="pageToCrawl"></param>
        /// <param name="crawlContext"></param>
        /// <returns></returns>
        public static CrawlDecision ShouldCrawlPageDecisionMaker(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            //Allow the root url, retries, and the url we explicitly asked for
            if (pageToCrawl.IsRetry || pageToCrawl.IsRoot || FeedUrl == pageToCrawl.Uri)
            {
                return new CrawlDecision() { Allow = true };
            }
            else
            {
                return new CrawlDecision { Allow = false, Reason = "Not match uri" };//when Allow is false the page is skipped
            }
        }

        /// <summary>
        /// Synchronous delegate that decides whether a page's content should be downloaded
        /// </summary>
        /// <param name="pageToCrawl"></param>
        /// <param name="crawlContext"></param>
        /// <returns></returns>
        private static CrawlDecision ShouldDownloadPageContentDecisionMaker(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            if (pageToCrawl.IsRoot || pageToCrawl.IsRetry || FeedUrl == pageToCrawl.Uri)
            {
                return new CrawlDecision
                {
                    Allow = true
                };
            }
            return new CrawlDecision { Allow = false, Reason = "Not match uri" };
        }

        /// <summary>
        /// Synchronous delegate that decides whether the links on a page should be crawled
        /// </summary>
        /// <param name="crawledPage"></param>
        /// <param name="crawlContext"></param>
        /// <returns></returns>
        private static CrawlDecision ShouldCrawlPageLinksDecisionMaker(CrawledPage crawledPage, CrawlContext crawlContext)
        {
            if (!crawledPage.IsInternal)
                return new CrawlDecision { Allow = false, Reason = "We dont crawl links of external pages" };

            if (crawledPage.IsRoot || crawledPage.IsRetry || crawledPage.Uri == FeedUrl)
            {
                return new CrawlDecision { Allow = true };
            }
            else
            {
                return new CrawlDecision { Allow = false, Reason = "We only crawl links of pagination pages" };
            }
        }

        public static IWebCrawler GetManuallyConfiguredWebCrawler()
        {
            //Build the crawl configuration first
            CrawlConfiguration config = new CrawlConfiguration();
            config.MaxConcurrentThreads = Environment.ProcessorCount;
            config.MaxPagesToCrawl = 1000;
            config.IsExternalPageCrawlingEnabled = false;
            config.IsUriRecrawlingEnabled = false;
            config.IsExternalPageLinksCrawlingEnabled = false;
            config.IsRespectRobotsDotTextEnabled = false;
            config.DownloadableContentTypes = "text/html, text/plain";
            config.MinCrawlDelayPerDomainMilliSeconds = 1000;
            config.CrawlTimeoutSeconds = 0;
            config.MaxPagesToCrawlPerDomain = 0;

            var crawler = new PoliteWebCrawler(config);
            //Decision delegates evaluated before a page is crawled
            crawler.ShouldCrawlPageDecisionMaker = ShouldCrawlPageDecisionMaker;
            crawler.ShouldDownloadPageContentDecisionMaker = ShouldDownloadPageContentDecisionMaker;
            crawler.ShouldCrawlPageLinksDecisionMaker = ShouldCrawlPageLinksDecisionMaker;
            //The crawl events
            crawler.PageCrawlStarting += PageCrawlStarting;//a single page starts crawling
            crawler.PageCrawlCompleted += PageCrawlCompleted;//a single page finished crawling
            //crawler.PageCrawlDisallowed += PageCrawlDisallowed;//a page was not allowed to be crawled
            //crawler.PageLinksCrawlDisallowed += PageLinksCrawlDisallowed;//a page's links were not allowed to be crawled

            return crawler;
        }

        public async Task Test()
        {
            var crawler = GetManuallyConfiguredWebCrawler();
            var result = await crawler.CrawlAsync(FeedUrl);//wait for the crawl to finish
        }
    }
}
That completes the modifications.
Then just call the Test() method to try it out.
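Here is a minimal sketch of how I invoke it, assuming a console project whose entry point can be async (the Program class below is just for illustration and is not part of the original sample):

using System.Threading.Tasks;

namespace AbotTest
{
    public class Program
    {
        public static async Task Main()
        {
            //Run the crawl and wait for it to finish before the process exits
            await new AbotTest().Test();
        }
    }
}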
After a successful run, a fake.txt file is generated.
Find it and take a look:
Perfect!