原理:
通过HtmlAgilityPack解析html源码得到所需的数据。
1、首先通过 http://blog.csdn.net/gdjlc 页面底部的“xx条数据 共xx页”,获取总页数;
2、获取每一页的所有文章URL,每一页的URL如下所示: http://blog.csdn.net/gdjlc/article/list/当前页索引,当前页索引从1一直循环到总页数,即可得到所有文章的URL;
3、获取单个文章的内容。
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Diagnostics;
using System.Collections;
using System.Threading;
using System.Threading.Tasks;
using System.Xml.Linq;
using System.Net;
using HtmlAgilityPack;
namespace Demo
{
public partial class FrmCSDN : Form
{
// Scraper configuration: blog identity plus the XPath expressions used to
// locate the relevant nodes in the downloaded HTML.
const string BLOGUSER = "gdjlc"; // blog user name
// Base URL of the blog site.
const string BLOGURL = "http://blog.csdn.net";
const string PAGECOUNTPATH = "//div[@id='papelist']/span[1]";// XPath of the total-page-count node
const string ARTICLEURLPATH = "//span[@class='link_title']/a[1]"; // XPath of an article URL link
const string ARTICLETITLEPATH = "//div[@class='article_title']/h3/span/a";// XPath of the article title
const string POSTDATEPATH = "//span[@class='link_postdate']"; // XPath of the article creation date
const string ARTICLECONTENTPATH = "//div[@id='article_content']"; // XPath of the article content
List<string> articleUrlList = new List<string>(); // URLs of all articles
// NOTE(review): presumably a lock/synchronization object (name looks like a
// misspelling of "monitor"); its usage is not visible here — confirm.
private object moniter = new object();
Stopwatch stopwatch = n