From time to time we all need to collect information from the web, and there are many ways to do it. To collect data more efficiently we usually rely on multiple threads, and once the pages have been downloaded the key step is parsing their content. Regular expressions are one option for that, but in this post we use the HtmlAgilityPack library instead.
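Before diving into the project, a minimal sketch of how HtmlAgilityPack turns markup into a queryable DOM may help. The class and method names below are placeholders of my own for illustration; the XPath matches the cnblogs.com post-title links that appear later in this post:

using System;
using HtmlAgilityPack;

namespace CollectDemo
{
    public static class HtmlAgilityPackPrimer
    {
        // Parse an HTML string and print every link whose class attribute is "titlelnk".
        public static void PrintTitleLinks(string html)
        {
            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(html);

            // SelectNodes takes an XPath expression and returns null when nothing matches.
            HtmlNodeCollection links = document.DocumentNode.SelectNodes("//a[@class = 'titlelnk']");
            if (links == null) return;

            foreach (HtmlNode link in links)
            {
                Console.WriteLine(link.InnerText + " -> " + link.Attributes["href"].Value);
            }
        }
    }
}

For example, PrintTitleLinks("<a class='titlelnk' href='/p/1'>demo</a>") would print "demo -> /p/1". The same LoadHtml/SelectNodes pattern is what the collector classes below use.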
The libraries used are HtmlAgilityPack and 苏飞's HttpHelper class. The project was built with Visual Studio 2008 against .NET Framework 2.0. The final result is shown in the screenshot:
There you can also see the main classes. A factory pattern is used here so that the collector is easier to extend. The code for CollectorFactoryManager.cs is as follows:
using System;
using System.Collections.Generic;

namespace CollectDemo
{
    /// <summary>
    /// Manages the collection factories
    /// </summary>
    public class CollectorFactoryManager
    {
        private const int initCount = 5;
        private IList<CollectorFactory> factoryList;
        private Action callback;
        private int collectFactoryIndex;

        public CollectorFactoryManager(Action callback)
        {
            this.callback = callback;
            this.factoryList = new List<CollectorFactory>();
            // Any number of factories can be added here
            this.factoryList.Add(new CollectorFactoryOne("http://www.cnblogs.com/", this.CollectorFactoryCalback));
            this.factoryList.Add(new CollectorFactoryOne("http://www.cnblogs.com/sitehome/p/2", this.CollectorFactoryCalback));
        }

        // Start collecting
        public void Run()
        {
            this.collectFactoryIndex = -1;
            // Threads have an upper limit, so only start an initial batch
            for (int index = 0; index < initCount && index < this.factoryList.Count; index++)
            {
                this.CollectorFactoryData();
            }
        }

        private void CollectorFactoryData()
        {
            lock (this)
            {
                this.collectFactoryIndex++;
                // Not finished yet: process the factories in order
                if (this.collectFactoryIndex < this.factoryList.Count)
                {
                    CollectorFactory collectorFactory = this.factoryList[this.collectFactoryIndex];
                    collectorFactory.Run();
                }
                else
                {
                    // All factories processed
                    this.End();
                }
            }
        }

        public void CollectorFactoryCalback()
        {
            this.CollectorFactoryData();
        }

        /// <summary>
        /// Collection finished
        /// </summary>
        public void End()
        {
            if (this.callback != null) this.callback();
        }
    }
}
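For completeness, here is a hedged sketch of how the manager could be started, for example from a console entry point. The Program class and the OnAllFactoriesDone callback are placeholders of my own; the original project drives this from a WinForms application instead:

using System;

namespace CollectDemo
{
    class Program
    {
        static void Main()
        {
            // Hypothetical driver: build the manager and kick off the first batch of factories.
            CollectorFactoryManager manager = new CollectorFactoryManager(OnAllFactoriesDone);
            manager.Run();

            // The factories run on their own threads and report back through the callback,
            // so simply keep the console open until the user closes it.
            Console.ReadLine();
        }

        // Invoked once every factory in factoryList has finished.
        static void OnAllFactoriesDone()
        {
            Console.WriteLine("All pages collected.");
        }
    }
}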
The code for CollectorFactory.cs is as follows:
using System;
using System.Collections.Generic;
using System.Threading;
using HtmlAgilityPack;

namespace CollectDemo
{
    public class CollectorFactory
    {
        private const int initCount = 10;
        protected string htmlText;
        protected string urlPath;
        protected IList<CollectorItem> collectorItemList;
        protected Action callback;
        protected int collectItemIndex;

        public CollectorFactory(string urlPath, Action callback)
        {
            this.urlPath = urlPath;
            this.callback = callback;
        }

        /// <summary>
        /// Start collecting
        /// </summary>
        public virtual void Run()
        {
            // Sleep a little so the requests are less likely to be flagged as a crawler
            int sleepData = new Random().Next(1000, 3000);
            Thread.Sleep(sleepData);
            Thread thread = new Thread(new ThreadStart(this.Start));
            thread.Start();
        }

        /// <summary>
        /// Thread entry point
        /// </summary>
        protected virtual void Start()
        {
            this.CreateAndGetHtmlContent();
            this.AnalysisHtmlContent();
            this.CollectorPageData();
        }

        /// <summary>
        /// Build the request and download the page
        /// </summary>
        protected virtual void CreateAndGetHtmlContent() { }

        /// <summary>
        /// Parse the downloaded page
        /// </summary>
        protected virtual void AnalysisHtmlContent() { }

        protected virtual void CollectorPageData()
        {
            this.collectItemIndex = -1;
            if (this.collectorItemList != null && this.collectorItemList.Count > 0)
            {
                for (int index = 0; index < initCount && index < this.collectorItemList.Count; index++)
                {
                    this.CollectorItemData();
                }
            }
        }

        public virtual void CollectorItemData()
        {
            lock (this)
            {
                this.collectItemIndex++;
                if (this.collectItemIndex < this.collectorItemList.Count)
                {
                    CollectorItem collectorItem = this.collectorItemList[this.collectItemIndex];
                    collectorItem.Run();
                }
                else
                {
                    // All items processed
                    this.End();
                }
            }
        }

        public void CollectorItemCalback()
        {
            this.CollectorItemData();
        }

        public virtual void End()
        {
            if (this.callback != null) this.callback();
        }
    }
}

The code for CollectorItem.cs is as follows:
using System;
using System.Collections.Generic;
using System.Threading;
using HtmlAgilityPack;

namespace CollectDemo
{
    public class CollectorItem
    {
        protected string htmlText;
        protected CollectorFactory collectorFactory;
        protected string urlPath;
        protected Action callback;

        public CollectorItem(CollectorFactory collectorFactory, string urlPath, Action callback)
        {
            this.collectorFactory = collectorFactory;
            this.urlPath = urlPath;
            this.callback = callback;
        }

        public void Run()
        {
            // Sleep a little so the requests are less likely to be flagged as a crawler
            int sleepData = new Random().Next(2000, 6000);
            Thread.Sleep(sleepData);
            Thread thread = new Thread(new ThreadStart(this.Start));
            thread.Start();
        }

        /// <summary>
        /// Thread entry point
        /// </summary>
        protected virtual void Start()
        {
            this.CreateAndGetHtmlContent();
            this.AnalysisHtmlContent();
        }

        /// <summary>
        /// Build the request and download the page
        /// </summary>
        protected virtual void CreateAndGetHtmlContent() { }

        /// <summary>
        /// Parse the downloaded page
        /// </summary>
        protected virtual void AnalysisHtmlContent() { }

        public virtual void End()
        {
            if (this.callback != null) this.callback();
        }
    }
}

This example collects the first two list pages of the cnblogs.com (博客园) home page, so we need a CollectorFactoryOne.cs class that extracts the post links from those two pages. The code is as follows:
using System;
using System.Collections.Generic;
using System.Threading;
using HtmlAgilityPack;

namespace CollectDemo
{
    public class CollectorFactoryOne : CollectorFactory
    {
        public CollectorFactoryOne(string urlPath, Action callback)
            : base(urlPath, callback)
        {
        }

        protected override void CreateAndGetHtmlContent()
        {
            HttpItem httpItem = new HttpItem();
            httpItem.URL = this.urlPath;
            httpItem.Method = "get";
            httpItem.UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:24.0) Gecko/20100101 Firefox/24.0";
            httpItem.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
            HttpResult httpResult = new HttpHelperUtils().GetHtml(httpItem);
            this.htmlText = httpResult.Html;
        }

        protected override void AnalysisHtmlContent()
        {
            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(this.htmlText);
            this.collectorItemList = new List<CollectorItem>();
            // Each post title on the list page is an <a class="titlelnk"> element
            HtmlNodeCollection hrefList = htmlDocument.DocumentNode.SelectNodes("//a[@class = 'titlelnk']");
            if (hrefList != null)
            {
                foreach (HtmlNode hrefNode in hrefList)
                {
                    HtmlAttribute htmlAttribute = hrefNode.Attributes["href"];
                    this.collectorItemList.Add(new CollectorItemOne(this, htmlAttribute.Value, this.CollectorItemCalback));
                }
            }
        }
    }
}

There is also a CollectorItemOne.cs class that parses the content of each individual post page. The code is as follows:
using System;
using System.Collections.Generic;
using System.Threading;
using HtmlAgilityPack;
using System.IO;

namespace CollectDemo
{
    public class CollectorItemOne : CollectorItem
    {
        public CollectorItemOne(CollectorFactory collectorFactory, string urlPath, Action callback)
            : base(collectorFactory, urlPath, callback)
        {
        }

        protected override void CreateAndGetHtmlContent()
        {
            HttpItem httpItem = new HttpItem();
            httpItem.URL = this.urlPath;
            httpItem.Method = "get";
            httpItem.UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:24.0) Gecko/20100101 Firefox/24.0";
            httpItem.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
            HttpResult httpResult = new HttpHelperUtils().GetHtml(httpItem);
            this.htmlText = httpResult.Html;
        }

        protected override void AnalysisHtmlContent()
        {
            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(this.htmlText);
            lock (this)
            {
                string htmlTitle = htmlDocument.DocumentNode.SelectSingleNode("//title").InnerText;
                // Create the output file under the txt folder, named with a GUID
                string filePath = System.Windows.Forms.Application.StartupPath + "\\txt\\";
                filePath += System.Guid.NewGuid() + ".txt";
                if (File.Exists(filePath)) return;
                File.Create(filePath).Close();
                try
                {
                    using (StreamWriter streamWriter = new StreamWriter(filePath, true, System.Text.Encoding.UTF8))
                    {
                        streamWriter.Write(htmlDocument.DocumentNode.InnerHtml);
                        streamWriter.Flush();
                        streamWriter.Close();
                    }
                }
                catch (Exception ex)
                {
                    // Handle/log the error
                }
                // This callback must be invoked here so the next item gets scheduled
                this.End();
            }
        }
    }
}
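One practical note: the StreamWriter above throws if the txt folder does not yet exist under the application directory, so it is worth creating it once before collection starts. A small sketch, assuming you add a helper of your own (the OutputFolder class below is not part of the original project):

using System.IO;
using System.Windows.Forms;

namespace CollectDemo
{
    public static class OutputFolder
    {
        // Ensure the txt folder exists before any CollectorItemOne writes to it.
        // Directory.CreateDirectory is a no-op when the folder is already there.
        public static void Ensure()
        {
            string path = Path.Combine(Application.StartupPath, "txt");
            Directory.CreateDirectory(path);
        }
    }
}

Calling OutputFolder.Ensure() once, for example at the start of CollectorFactoryManager.Run(), avoids a DirectoryNotFoundException on the first write.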
The main multithreading logic is already encapsulated in the base classes, so to add a new source you only need to implement the downloading and page-parsing parts, which makes the collector quick to extend.
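To illustrate, here is a hedged sketch of what such an extension might look like. The CollectorFactoryTwo class and its XPath are placeholders of my own, not part of the original project; it reuses the HTTP download from CollectorFactoryOne and only swaps the parsing logic:

using System;
using System.Collections.Generic;
using HtmlAgilityPack;

namespace CollectDemo
{
    // Hypothetical example of extending the collector to another list page.
    // The download step is inherited from CollectorFactoryOne; only the parsing differs.
    public class CollectorFactoryTwo : CollectorFactoryOne
    {
        public CollectorFactoryTwo(string urlPath, Action callback)
            : base(urlPath, callback)
        {
        }

        protected override void AnalysisHtmlContent()
        {
            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(this.htmlText);
            this.collectorItemList = new List<CollectorItem>();

            // Placeholder XPath: adjust it to whatever markup the new site uses for its links.
            HtmlNodeCollection hrefList = htmlDocument.DocumentNode.SelectNodes("//div[@class = 'post-title']/a");
            if (hrefList != null)
            {
                foreach (HtmlNode hrefNode in hrefList)
                {
                    // Reuse CollectorItemOne here, or plug in a new CollectorItem subclass for the new site.
                    this.collectorItemList.Add(new CollectorItemOne(this, hrefNode.Attributes["href"].Value, this.CollectorItemCalback));
                }
            }
        }
    }
}

An instance of such a class would then simply be added to factoryList in the CollectorFactoryManager constructor; the threading, scheduling, and callbacks all come from the base classes.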