From time to time we all need to collect information from the web, and there are many ways to do it. To collect data more efficiently we usually rely on multiple threads, and once the pages have been downloaded the key step is parsing their content. Regular expressions are one option for that, but in this post we use the HtmlAgilityPack library instead.
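Before diving into the project, a minimal sketch of how HtmlAgilityPack turns markup into a queryable DOM may help. The class and method names below are placeholders of my own for illustration; the XPath matches the cnblogs.com post-title links that appear later in this post:

using System;
using HtmlAgilityPack;

namespace CollectDemo
{
    public static class HtmlAgilityPackPrimer
    {
        // Parse an HTML string and print every link whose class attribute is "titlelnk".
        public static void PrintTitleLinks(string html)
        {
            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(html);

            // SelectNodes takes an XPath expression and returns null when nothing matches.
            HtmlNodeCollection links = document.DocumentNode.SelectNodes("//a[@class = 'titlelnk']");
            if (links == null) return;

            foreach (HtmlNode link in links)
            {
                Console.WriteLine(link.InnerText + " -> " + link.Attributes["href"].Value);
            }
        }
    }
}

For example, PrintTitleLinks("<a class='titlelnk' href='/p/1'>demo</a>") would print "demo -> /p/1". The same LoadHtml/SelectNodes pattern is what the collector classes below use.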
The libraries used are HtmlAgilityPack and 苏飞's HttpHelper class. The project was built with Visual Studio 2008 against .NET Framework 2.0. The final result is shown in the screenshot:
There you can also see the main classes. A factory pattern is used here so that the collector is easier to extend. The code for CollectorFactoryManager.cs is as follows:
using System;
using System.Collections.Generic;

namespace CollectDemo
{
    /// <summary>
    /// Manages the collection factories
    /// </summary>
    public class CollectorFactoryManager
    {
        private const int initCount = 5;
        private IList<CollectorFactory> factoryList;
        private Action callback;
        private int collectFactoryIndex;

        public CollectorFactoryManager(Action callback)
        {
            this.callback = callback;
            this.factoryList = new List<CollectorFactory>();
            // Any number of factories can be added here
            this.factoryList.Add(new CollectorFactoryOne("http://www.cnblogs.com/", this.CollectorFactoryCalback));
            this.factoryList.Add(new CollectorFactoryOne("http://www.cnblogs.com/sitehome/p/2", this.CollectorFactoryCalback));
        }

        // Start collecting
        public void Run()
        {
            this.collectFactoryIndex = -1;
            // Threads have an upper limit, so only start an initial batch
            for (int index = 0; index < initCount && index < this.factoryList.Count; index++)
            {
                this.CollectorFactoryData();
            }
        }

        private void CollectorFactoryData()
        {
            lock (this)
            {
                this.collectFactoryIndex++;
                // Not finished yet: process the factories in order
                if (this.collectFactoryIndex < this.factoryList.Count)
                {
                    CollectorFactory collectorFactory = this.factoryList[this.collectFactoryIndex];
                    collectorFactory.Run();
                }
                else
                {
                    // All factories processed
                    this.End();
                }
            }
        }

        public void CollectorFactoryCalback()
        {
            this.CollectorFactoryData();
        }

        /// <summary>
        /// Collection finished
        /// </summary>
        public void End()
        {
            if (this.callback != null) this.callback();
        }
    }
}
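For completeness, here is a hedged sketch of how the manager could be started, for example from a console entry point. The Program class and the OnAllFactoriesDone callback are placeholders of my own; the original project drives this from a WinForms application instead:

using System;

namespace CollectDemo
{
    class Program
    {
        static void Main()
        {
            // Hypothetical driver: build the manager and kick off the first batch of factories.
            CollectorFactoryManager manager = new CollectorFactoryManager(OnAllFactoriesDone);
            manager.Run();

            // The factories run on their own threads and report back through the callback,
            // so simply keep the console open until the user closes it.
            Console.ReadLine();
        }

        // Invoked once every factory in factoryList has finished.
        static void OnAllFactoriesDone()
        {
            Console.WriteLine("All pages collected.");
        }
    }
}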
The code for CollectorFactory.cs is as follows:
using System;
using System.Collections.Generic;
using System.Threading;
using HtmlAgilityPack;

namespace CollectDemo
{
    public class CollectorFactory
    {
        private const int initCount = 10;
        protected string htmlText;
        protected string urlPath;
        protected IList<CollectorItem> collectorItemList;
        protected Action callback;
        protected int collectItemIndex;

        public CollectorFactory(string urlPath, Action callback)
        {
            this.urlPath = urlPath;
            this.callback = callback;
        }

        /// <summary>
        /// Start collecting
        /// </summary>
        public virtual void Run()
        {
            // Sleep a little so the requests are less likely to be flagged as a crawler
            int sleepData = new Random().Next(1000, 3000);
            Thread.Sleep(sleepData);
            Thread thread = new Thread(new ThreadStart(this.Start));
            thread.Start();
        }

        /// <summary>
        /// Thread entry point
        /// </summary>
        protected virtual void Start()
        {
            this.CreateAndGetHtmlContent();
            this.AnalysisHtmlContent();
            this.CollectorPageData();
        }

        /// <summary>
        /// Build the request and download the page
        /// </summary>
        protected virtual void CreateAndGetHtmlContent() { }

        /// <summary>
        /// Parse the downloaded page
        /// </summary>
        protected virtual void AnalysisHtmlContent() { }

        protected virtual void CollectorPageData()
        {
            this.collectItemIndex = -1;
            if (this.collectorItemList != null && this.collectorItemList.Count > 0)
            {
                for (int index = 0; index < initCount && index < this.collectorItemList.Count; index++)
                {
                    this.CollectorItemData();
                }
            }
        }

        public virtual void CollectorItemData()
        {
            lock (this)
            {
                this.collectItemIndex++;
                if (this.collectItemIndex < this.collectorItemList.Count)
                {
                    CollectorItem collectorItem = this.collectorItemList[this.collectItemIndex];
                    collectorItem.Run();
                }
                else
                {
                    // All items processed
                    this.End();
                }
            }
        }

        public void CollectorItemCalback()
        {
            this.CollectorItemData();
        }

        public virtual void End()
        {
            if (this.callback != null) this.callback();
        }
    }
}

The code for CollectorItem.cs is as follows:
using System;
using System.Collections.Generic;
using System.Threading;
using HtmlAgilityPack;

namespace CollectDemo
{
    public class CollectorItem
    {
        protected string htmlText;
        protected CollectorFactory collectorFactory;
        protected string urlPath;
        protected Action callback;

        public CollectorItem(CollectorFactory collectorFactory, string urlPath, Action callback)
        {
            this.collectorFactory = collectorFactory;
            this.urlPath = urlPath;
            this.callback = callback;
        }

        public void Run()
        {
            // Sleep a little so the requests are less likely to be flagged as a crawler
            int sleepData = new Random().Next(2000, 6000);
            Thread.Sleep(sleepData);
            Thread thread = new Thread(new ThreadStart(this.Start));
            thread.Start();
        }

        /// <summary>
        /// Thread entry point
        /// </summary>
        protected virtual void Start()
        {
            this.CreateAndGetHtmlContent();
            this.AnalysisHtmlContent();
        }

        /// <summary>
        /// Build the request and download the page
        /// </summary>
        protected virtual void CreateAndGetHtmlContent() { }

        /// <summary>
        /// Parse the downloaded page
        /// </summary>
        protected virtual void AnalysisHtmlContent() { }

        public virtual void End()
        {
            if (this.callback != null) this.callback();
        }
    }
}

This example collects the first two list pages of the cnblogs.com (博客园) home page, so we need a CollectorFactoryOne.cs class that extracts the post links from those two pages. The code is as follows:
using System;
using System.Collections.Generic;
using System.Threading;
using HtmlAgilityPack;

namespace CollectDemo
{
    public class CollectorFactoryOne : CollectorFactory
    {
        public CollectorFactoryOne(string urlPath, Action callback)
            : base(urlPath, callback)
        {
        }

        protected override void CreateAndGetHtmlContent()
        {
            HttpItem httpItem = new HttpItem();
            httpItem.URL = this.urlPath;
            httpItem.Method = "get";
            httpItem.UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:24.0) Gecko/20100101 Firefox/24.0";
            httpItem.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
            HttpResult httpResult = new HttpHelperUtils().GetHtml(httpItem);
            this.htmlText = httpResult.Html;
        }

        protected override void AnalysisHtmlContent()
        {
            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(this.htmlText);
            this.collectorItemList = new List<CollectorItem>();
            // Each post title on the list page is an <a class="titlelnk"> element
            HtmlNodeCollection hrefList = htmlDocument.DocumentNode.SelectNodes("//a[@class = 'titlelnk']");
            if (hrefList != null)
            {
                foreach (HtmlNode hrefNode in hrefList)
                {
                    HtmlAttribute htmlAttribute = hrefNode.Attributes["href"];
                    this.collectorItemList.Add(new CollectorItemOne(this, htmlAttribute.Value, this.CollectorItemCalback));
                }
            }
        }
    }
}

There is also a CollectorItemOne.cs class that parses the content of each individual post page. The code is as follows:
using System;
using System.Collections.Generic;
using System.Threading;
using HtmlAgilityPack;
using System.IO;

namespace CollectDemo
{
    public class CollectorItemOne : CollectorItem
    {
        public CollectorItemOne(CollectorFactory collectorFactory, string urlPath, Action callback)
            : base(collectorFactory, urlPath, callback)
        {
        }

        protected override void CreateAndGetHtmlContent()
        {
            HttpItem httpItem = new HttpItem();
            httpItem.URL = this.urlPath;
            httpItem.Method = "get";
            httpItem.UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:24.0) Gecko/20100101 Firefox/24.0";
            httpItem.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
            HttpResult httpResult = new HttpHelperUtils().GetHtml(httpItem);
            this.htmlText = httpResult.Html;
        }

        protected override void AnalysisHtmlContent()
        {
            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(this.htmlText);
            lock (this)
            {
                string htmlTitle = htmlDocument.DocumentNode.SelectSingleNode("//title").InnerText;
                // Create the output file under the txt folder, named with a GUID
                string filePath = System.Windows.Forms.Application.StartupPath + "\\txt\\";
                filePath += System.Guid.NewGuid() + ".txt";
                if (File.Exists(filePath)) return;
                File.Create(filePath).Close();
                try
                {
                    using (StreamWriter streamWriter = new StreamWriter(filePath, true, System.Text.Encoding.UTF8))
                    {
                        streamWriter.Write(htmlDocument.DocumentNode.InnerHtml);
                        streamWriter.Flush();
                        streamWriter.Close();
                    }
                }
                catch (Exception ex)
                {
                    // Handle/log the error
                }
                // This callback must be invoked here so the next item gets scheduled
                this.End();
            }
        }
    }
}
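One practical note: the StreamWriter above throws if the txt folder does not yet exist under the application directory, so it is worth creating it once before collection starts. A small sketch, assuming you add a helper of your own (the OutputFolder class below is not part of the original project):

using System.IO;
using System.Windows.Forms;

namespace CollectDemo
{
    public static class OutputFolder
    {
        // Ensure the txt folder exists before any CollectorItemOne writes to it.
        // Directory.CreateDirectory is a no-op when the folder is already there.
        public static void Ensure()
        {
            string path = Path.Combine(Application.StartupPath, "txt");
            Directory.CreateDirectory(path);
        }
    }
}

Calling OutputFolder.Ensure() once, for example at the start of CollectorFactoryManager.Run(), avoids a DirectoryNotFoundException on the first write.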
The main multithreading logic is already encapsulated in the base classes, so to add a new source you only need to implement the downloading and page-parsing parts, which makes the collector quick to extend.
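To illustrate, here is a hedged sketch of what such an extension might look like. The CollectorFactoryTwo class and its XPath are placeholders of my own, not part of the original project; it reuses the HTTP download from CollectorFactoryOne and only swaps the parsing logic:

using System;
using System.Collections.Generic;
using HtmlAgilityPack;

namespace CollectDemo
{
    // Hypothetical example of extending the collector to another list page.
    // The download step is inherited from CollectorFactoryOne; only the parsing differs.
    public class CollectorFactoryTwo : CollectorFactoryOne
    {
        public CollectorFactoryTwo(string urlPath, Action callback)
            : base(urlPath, callback)
        {
        }

        protected override void AnalysisHtmlContent()
        {
            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(this.htmlText);
            this.collectorItemList = new List<CollectorItem>();

            // Placeholder XPath: adjust it to whatever markup the new site uses for its links.
            HtmlNodeCollection hrefList = htmlDocument.DocumentNode.SelectNodes("//div[@class = 'post-title']/a");
            if (hrefList != null)
            {
                foreach (HtmlNode hrefNode in hrefList)
                {
                    // Reuse CollectorItemOne here, or plug in a new CollectorItem subclass for the new site.
                    this.collectorItemList.Add(new CollectorItemOne(this, hrefNode.Attributes["href"].Value, this.CollectorItemCalback));
                }
            }
        }
    }
}

An instance of such a class would then simply be added to factoryList in the CollectorFactoryManager constructor; the threading, scheduling, and callbacks all come from the base classes.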