写了一个小爬虫,把CSDN上发表的博客全都备份了下。
获取发表过的文章信息,存入到数据库。
C#中用AngleSharp这个组件,就可以像用LINQ一样进行HTML标签的查询操作。
所以从html里获取需要的内容是非常方便的
具体代码,随便写的:
using AngleSharp.Parser.Html;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
namespace Crawler
{
/// <summary>
/// Console crawler that downloads pages, extracts data with AngleSharp CSS
/// selectors and stores the results through the <c>bds284289328_dbEntities1</c>
/// database context.
/// </summary>
class Program
{
    /// <summary>
    /// Single HTTP client shared by every request. Creating and disposing a
    /// client per call leaves sockets in TIME_WAIT and can exhaust them
    /// during a long crawl.
    /// </summary>
    private static readonly HttpClient SharedHttpClient = new HttpClient();

    /// <summary>Running total of records queued for insertion by <c>ff</c> and <c>fuac</c>.</summary>
    static int count = 0;

    /// <summary>
    /// Entry point. The two crawl stages below are kept commented out, exactly
    /// as in the original; only the probe call to <c>f</c> is active.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        Console.WriteLine("Hello World!");

        // Stage 1 (disabled): crawl list pages 1..10 and store one CSDN_Article row per post.
        //for (int i = 1; i <= 10; i++)
        //{
        //    fuac("https://blog.csdn.net/qq_32688731/article/list/"+i);
        //    Console.WriteLine(count);
        //}

        // Stage 2 (disabled): visit every stored article link and store its details,
        // pausing 500 ms between requests.
        //bds284289328_dbEntities1 db = new bds284289328_dbEntities1();
        //db.CSDN_Article.ToList().ForEach(r =>
        //{
        //    ff(r.Link, r.Id);
        //    System.Threading.Thread.Sleep(500);
        //});

        f("https://user.qzone.qq.com/1439084907", -1);
        Console.ReadLine();
    }

    /// <summary>
    /// Downloads <paramref name="url"/>, parses it and selects the
    /// <c>body</c> elements. The result is discarded, so this only verifies
    /// that the page can be fetched and parsed.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <param name="id">Unused; kept so the signature matches <c>ff</c>.</param>
    static void f(string url, int id)
    {
        var document = new HtmlParser().Parse(DownloadHtml(url));
        document.QuerySelectorAll("body").ToList();
    }

    /// <summary>
    /// Downloads an article page and stores one <c>CSDN_Details</c> row for
    /// every <c>#main</c> element that contains an <c>.article_content</c> element.
    /// </summary>
    /// <param name="url">Address of the article page.</param>
    /// <param name="id">Value written to <c>ArticleListId</c> of each stored row.</param>
    static void ff(string url, int id)
    {
        var document = new HtmlParser().Parse(DownloadHtml(url));

        var data = new List<details_itme>();
        foreach (var main in document.QuerySelectorAll("#main"))
        {
            string content = InnerHtmlOf(main, ".article_content");
            if (content == null)
            {
                // Without the article body there is nothing worth backing up;
                // report it instead of crashing the whole run.
                Console.Error.WriteLine("Skipped " + url + ": no .article_content element found.");
                continue;
            }

            data.Add(new details_itme()
            {
                // "-1" marks an article that has no category link.
                ArticleType = AttributeOf(main, ".subItem_t a", "href") ?? "-1",
                // NOTE(review): Clean() removes every "\n", which also flattens
                // line breaks inside <pre> blocks of the stored HTML.
                ArticleContent = Clean(content),
                ArticleDetails = Clean(InnerHtmlOf(main, "#article_details")),
            });
        }

        // NOTE(review): assumes bds284289328_dbEntities1 is an Entity Framework
        // DbContext (and therefore IDisposable) - confirm.
        using (var db = new bds284289328_dbEntities1())
        {
            foreach (var item in data)
            {
                CSDN_Details cSDN_Details = new CSDN_Details();
                // Category links look like https://blog.csdn.net/qq_32688731/article/category/6568994
                cSDN_Details.ArticleType = ParseCategoryId(item.ArticleType);
                cSDN_Details.ArticleContent = item.ArticleContent;
                cSDN_Details.ArticleDetails = item.ArticleDetails;
                cSDN_Details.ArticleListId = id;
                db.CSDN_Details.Add(cSDN_Details);
                count++;
                Console.WriteLine(count);
            }
            db.SaveChanges();
        }
    }

    /// <summary>
    /// Downloads an article list page and stores one <c>CSDN_Article</c> row
    /// for every <c>.list_item</c> element that has a link and a parsable
    /// post date. Entries lacking either are reported on stderr and skipped.
    /// </summary>
    /// <param name="url">Address of the list page.</param>
    static void fuac(string url)
    {
        var document = new HtmlParser().Parse(DownloadHtml(url));

        var data = document.QuerySelectorAll(".list_item")
            .Select(t => new list_item()
            {
                article_type = t.QuerySelector(".ico_type_Original") != null ? 1 : 0,
                article_link = AttributeOf(t, ".link_title a", "href"),
                article_title = TextOf(t, ".link_title"),
                article_description = TextOf(t, ".article_description"),
                article_postdate = TextOf(t, ".link_postdate"),
                article_view = TextOf(t, ".link_view"),
                article_comments = TextOf(t, ".link_comments"),
            })
            .ToList();

        // NOTE(review): assumes bds284289328_dbEntities1 is an Entity Framework
        // DbContext (and therefore IDisposable) - confirm.
        using (var db = new bds284289328_dbEntities1())
        {
            foreach (var item in data)
            {
                string link = Clean(item.article_link);
                DateTime postdate;
                bool hasDate = DateTime.TryParse(
                    Clean(item.article_postdate),
                    System.Globalization.CultureInfo.InvariantCulture,
                    System.Globalization.DateTimeStyles.None,
                    out postdate);

                if (link.Length == 0 || !hasDate)
                {
                    Console.Error.WriteLine("Skipped a list entry on " + url + ": missing link or unreadable post date.");
                    continue;
                }

                CSDN_Article cSDN_Article = new CSDN_Article();
                cSDN_Article.Type = item.article_type;
                cSDN_Article.Link = link;
                cSDN_Article.Title = Clean(item.article_title);
                cSDN_Article.Description = Clean(item.article_description);
                cSDN_Article.Postdate = postdate;
                cSDN_Article.ViewCount = ParseCounter(item.article_view, "阅读(");
                cSDN_Article.Comments = ParseCounter(item.article_comments, "评论(");
                db.CSDN_Article.Add(cSDN_Article);
                count++;
            }
            db.SaveChanges();
        }
    }

    /// <summary>Downloads the page at <paramref name="url"/> and returns its body as a string.</summary>
    private static string DownloadHtml(string url)
    {
        // GetAwaiter().GetResult() rethrows the underlying exception directly,
        // whereas .Result would wrap it in an AggregateException.
        return SharedHttpClient.GetStringAsync(url).GetAwaiter().GetResult();
    }

    /// <summary>Returns the text content of the first match of <paramref name="selector"/>, or null when nothing matches.</summary>
    private static string TextOf(AngleSharp.Dom.IElement parent, string selector)
    {
        var element = parent.QuerySelector(selector);
        return element == null ? null : element.TextContent;
    }

    /// <summary>Returns the inner HTML of the first match of <paramref name="selector"/>, or null when nothing matches.</summary>
    private static string InnerHtmlOf(AngleSharp.Dom.IElement parent, string selector)
    {
        var element = parent.QuerySelector(selector);
        return element == null ? null : element.InnerHtml;
    }

    /// <summary>Returns the named attribute of the first match of <paramref name="selector"/>, or null when nothing matches.</summary>
    private static string AttributeOf(AngleSharp.Dom.IElement parent, string selector, string attribute)
    {
        var element = parent.QuerySelector(selector);
        return element == null ? null : element.GetAttribute(attribute);
    }

    /// <summary>Trims <paramref name="text"/> and removes every line feed; null becomes an empty string.</summary>
    private static string Clean(string text)
    {
        return text == null ? string.Empty : text.Trim().Replace("\n", "");
    }

    /// <summary>
    /// Extracts the number from a counter such as <c>阅读(123)</c> by removing
    /// <paramref name="prefix"/> and the closing parenthesis.
    /// Returns 0 when the remaining text is not an integer.
    /// </summary>
    private static int ParseCounter(string text, string prefix)
    {
        string digits = Clean(text).Replace(prefix, "").Replace(")", "");
        int value;
        bool parsed = int.TryParse(
            digits,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out value);
        return parsed ? value : 0;
    }

    /// <summary>
    /// Reads the numeric id from the last path segment of a category link.
    /// Returns -1 for the "-1" sentinel, for a null or empty value, and for
    /// any link whose last segment is not an integer.
    /// </summary>
    private static int ParseCategoryId(string categoryHref)
    {
        if (string.IsNullOrEmpty(categoryHref))
        {
            return -1;
        }

        // Tolerate a trailing slash so ".../category/123/" still yields 123.
        string trimmed = categoryHref.TrimEnd('/');
        string lastSegment = trimmed.Substring(trimmed.LastIndexOf('/') + 1);

        int categoryId;
        bool parsed = int.TryParse(
            lastSegment,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out categoryId);
        return parsed ? categoryId : -1;
    }
}
/// <summary>
/// Raw values scraped from one entry of an article list page. Every field
/// except <see cref="article_type"/> holds unparsed text.
/// </summary>
internal class list_item
{
    /// <summary>Article kind: 1 = original, 0 = reposted.</summary>
    public int article_type { get; set; }

    /// <summary>Link to the article.</summary>
    public string article_link { get; set; }

    /// <summary>Article title.</summary>
    public string article_title { get; set; }

    /// <summary>Article description.</summary>
    public string article_description { get; set; }

    /// <summary>Publication time, as text.</summary>
    public string article_postdate { get; set; }

    /// <summary>View count, as text.</summary>
    public string article_view { get; set; }

    /// <summary>Comment count, as text.</summary>
    public string article_comments { get; set; }
}
/// <summary>
/// Raw values scraped from one article detail page.
/// </summary>
internal class details_itme
{
    /// <summary>Article category.</summary>
    public string ArticleType { get; set; }

    /// <summary>Article content.</summary>
    public string ArticleContent { get; set; }

    /// <summary>Article details.</summary>
    public string ArticleDetails { get; set; }

    /// <summary>Foreign key to the article list entry.</summary>
    public string ArticleListId { get; set; }
}
}