DotnetSpider
DotnetSpider 是一个轻量、灵活、高性能、跨平台的分布式网络爬虫框架,可以帮助 .NET 工程师快速的完成爬虫的开发。
目标
第一个简单的爬虫需要达到的目标:
- 从博客园抓取博客;
- 抓取标题、作者、发布时间、正文Url;
准备工作
- 新建控制台项目
- 通过 NuGet 添加 DotnetSpider、Serilog.AspNetCore、Serilog.Sinks.Console、Serilog.Sinks.RollingFile、Serilog.Sinks.PeriodicBatching
开始
- 创建BlogSpider.cs并添加以下代码
/// <summary>
/// A minimal DotnetSpider crawler: wires up a custom parser and a console
/// storage in <see cref="InitializeAsync"/>, then seeds the request queue.
/// </summary>
public class BlogSpider : Spider
{
/// <summary>
/// Passes the framework-supplied options, services and logger straight to
/// the <see cref="Spider"/> base class; no extra state is kept here.
/// </summary>
public BlogSpider(IOptions<SpiderOptions> options, SpiderServices services, ILogger<Spider> logger) : base(
options, services, logger)
{
}
/// <summary>
/// Builds the data-flow pipeline (parser first, then storage — order matters)
/// and enqueues the initial request(s).
/// </summary>
protected override async Task InitializeAsync(CancellationToken stoppingToken)
{
// Register the custom parser.
AddDataFlow(new Parser());
// Use the console storage so parsed results are printed.
AddDataFlow(new ConsoleStorage());
// Seed the crawl; the empty URL is a placeholder filled in later in the tutorial.
await AddRequestsAsync("");
}
// Placeholder parser: does nothing yet; the real parsing logic is added
// later in the tutorial.
class Parser : DataParser
{
protected override Task Parse(DataContext context)
{
var selectable = context.Selectable;
return Task.CompletedTask;
}
}
}
- 创建Blog.cs并添加以下代码
/// <summary>
/// Plain data holder for one crawled blog entry from the cnblogs list page.
/// </summary>
public class Blog
{
/// <summary>Post title.</summary>
public string Title { get; set; }
/// <summary>Publication time as the raw text extracted from the page footer.</summary>
public string Time { get; set; }
/// <summary>Author display name.</summary>
public string Author { get; set; }
/// <summary>Absolute URL of the post body.</summary>
public string ContentUrl { get; set; }
}
- 找出博客园博客列表请求地址
url:https://www.cnblogs.com/sitehome/p/x
- 修改采集请求地址
/// <summary>
/// Builds the data-flow pipeline (parser first, then console storage) and
/// enqueues the first ten cnblogs list pages as seed requests.
/// </summary>
protected override async Task InitializeAsync(CancellationToken stoppingToken)
{
// Register the custom parser.
AddDataFlow(new Parser());
// Print parsed results to the console.
AddDataFlow(new ConsoleStorage());
// Pre-size the seed array: list pages 1 through 10.
var pageUrls = new string[10];
for (var page = 1; page <= 10; page++)
{
pageUrls[page - 1] = $"https://www.cnblogs.com/sitehome/p/{page}";
}
// Enqueue all seed requests at once.
await AddRequestsAsync(pageUrls);
}
- 添加解析逻辑
/// <summary>
/// Parses one cnblogs list page: extracts title, publication time, author and
/// body URL for every post under the &lt;div id="post_list"&gt; container and
/// publishes the resulting list under the "Blogs" key.
/// </summary>
protected override Task Parse(DataContext context)
{
var selectable = context.Selectable;
// All posts on the list page live under the post_list container.
var postList = selectable.XPath(".//div[@id='post_list']").Nodes();
List<Blog> blogList = new List<Blog>();
foreach (var postItem in postList)
{
// Title; nodes without one are not real post entries — skip them.
var title = postItem.XPath(".//a[@class='titlelnk']")?.Value;
if (string.IsNullOrEmpty(title))
{
continue;
}
// The footer text reads "发布于 <time> ... 评论(n)"; slice out the time.
var time = postItem.XPath(".//div[@class='post_item_foot']")?.Value;
if (!string.IsNullOrEmpty(time))
{
time = time.Replace("\r", string.Empty).Replace("\n", string.Empty);
var start = time.IndexOf("发布于");
var end = time.IndexOf("评论(");
// Guard against page-layout changes: the original code crashed with
// a negative Substring argument when either marker was missing.
time = start >= 0 && end > start + 3
? time.Substring(start + 3, end - (start + 3)).Trim()
: null;
}
// Author display name.
var author = postItem.XPath(".//div[@class='post_item_foot']/a")?.Value;
// Absolute URL of the post body.
var contentUrl = postItem.XPath(".//div[@class='post_item_body']//a[@class='titlelnk']/@href")?.Value;
blogList.Add(new Blog
{
Title = title,
Time = time,
Author = author,
ContentUrl = contentUrl
});
}
context.AddData("Blogs", blogList);
return Task.CompletedTask;
}
- 修改Program.cs
/// <summary>
/// Entry point: configures Serilog, builds the spider host and runs the
/// <see cref="BlogSpider"/> to completion.
/// </summary>
static async System.Threading.Tasks.Task Main(string[] args)
{
// Keep our own logs at Information while routing framework noise to Warning;
// write both to the console and a rolling log file.
var loggerConfiguration = new LoggerConfiguration()
.MinimumLevel.Information()
.MinimumLevel.Override("Microsoft.Hosting.Lifetime", LogEventLevel.Warning)
.MinimumLevel.Override("Microsoft", LogEventLevel.Warning)
.MinimumLevel.Override("System", LogEventLevel.Warning)
.MinimumLevel.Override("Microsoft.AspNetCore.Authentication", LogEventLevel.Warning)
.Enrich.FromLogContext()
.WriteTo.Console()
.WriteTo.RollingFile("logs/spiders.log");
Log.Logger = loggerConfiguration.CreateLogger();
var spiderBuilder = Builder.CreateDefaultBuilder<BlogSpider>(options =>
{
// Throttle to one request per second.
options.Speed = 1;
// Abort a request after 10 seconds.
options.RequestTimeout = 10;
});
spiderBuilder.UseSerilog();
spiderBuilder.UseQueueDistinctBfsScheduler<HashSetDuplicateRemover>();
await spiderBuilder.Build().RunAsync();
// Force process exit once the crawl finishes.
Environment.Exit(0);
}
运行
总结
虽然只是一个 HelloWorld 项目,但要写好解析逻辑,对 XPath 的使用需要有一定的了解。