简易网络爬虫程序的开发(6)(c#版)

 

新建工程选择windows服务(示例程序的开发平台是VS2008)

工程名称是SpiderServer,将新建工程中的Service1类的类名改成SpiderServer.

在工程中添加安装类Installer1,代码如下:

namespace SpiderServer
{
    /// <summary>
    /// Installer component (marked with <c>[RunInstaller(true)]</c>) that describes how the
    /// "SpiderServer" Windows service is registered: which account it runs under,
    /// its service name/description, and its start mode.
    /// </summary>
    [RunInstaller(true)]
    public partial class Installer1 : Installer
    {
        private System.ServiceProcess.ServiceProcessInstaller spInstaller;

        private System.ServiceProcess.ServiceInstaller sInstaller;

        /// <summary>
        /// Builds the process installer and the service installer and registers both
        /// in this installer's <c>Installers</c> collection.
        /// </summary>
        public Installer1()
        {
            InitializeComponent();

            // Process-level settings: run as LocalSystem, so no explicit credentials are supplied.
            this.spInstaller = new System.ServiceProcess.ServiceProcessInstaller
            {
                Account = System.ServiceProcess.ServiceAccount.LocalSystem,
                Username = null,
                Password = null
            };

            // Service-level settings: name, description, and automatic start mode.
            this.sInstaller = new System.ServiceProcess.ServiceInstaller
            {
                ServiceName = "SpiderServer",
                Description = "网络爬虫",
                StartType = System.ServiceProcess.ServiceStartMode.Automatic
            };

            // Same registration order as before: process installer first, then service installer.
            this.Installers.Add(this.spInstaller);
            this.Installers.Add(this.sInstaller);
        }
    }
}

引用程序集Spider,现在我们需要为Spider添加职责链处理过程,自定义新类ChainNode,该类继承AbsChain

并重写方法Process,完成以下工作:从HTML中获取所有的超链接,并将有效超链接添加到UrlStack中;同时获取当前HTML中的Title值,并将其写入数据表SpiderTable中。源码如下:

namespace SpiderServer
{
    /// <summary>
    /// Chain node that processes one downloaded HTML page: it pushes the hyperlinks it
    /// finds onto <c>WebSpider.UrlStack</c> and records the page's URL together with its
    /// &lt;title&gt; text through the <c>AddWeb</c> stored procedure.
    /// </summary>
    class ChainNode : WebSpider.AbsChain
    {
        // Matches href attributes in three forms (unquoted up to '>', double-quoted, single-quoted).
        // Alternatives are tried in order, so the first one may capture a value that still
        // carries its quotes and trailing attributes; CleanHref strips those afterwards.
        // NOTE: [\s\S] means "any character including newlines". The previous [/s/S] only
        // matched the literal characters '/', 's' and 'S'.
        private static readonly Regex HrefPattern = new Regex(
            @"href=(?<web_url>[\s\S]*?)>|href=""(?<web_url>[\s\S]*?)""|href='(?<web_url>[\s\S]*?)'");

        // Case-insensitive so <TITLE> is found without lower-casing the page, which keeps the
        // stored title in its original case; culture-invariant so matching does not depend on
        // the service's locale.
        private static readonly Regex TitlePattern = new Regex(
            @"<title[\s\S]*?>(?<title>[\s\S]*?)</title>",
            RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

        /// <summary>
        /// Extracts links and the title from <paramref name="html"/>.
        /// Every captured href containing "http://" is pushed onto the URL stack; when a
        /// non-empty title is found, the current URL and title are saved via <see cref="AddUrl"/>.
        /// </summary>
        /// <param name="html">Page markup to scan. Null or empty input is ignored.</param>
        /// <remarks>
        /// Processing is best-effort: any exception is traced and swallowed so that a
        /// failure on one page does not propagate to the caller.
        /// </remarks>
        protected override void Process(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return;
            }

            try
            {
                foreach (Match m in HrefPattern.Matches(html))
                {
                    string url = CleanHref(m.Groups["web_url"].Value);

                    // Only links containing "http://" are queued (same filter as before,
                    // now compared ordinally).
                    if (url.IndexOf("http://", System.StringComparison.Ordinal) != -1)
                    {
                        WebSpider.UrlStack.Instance.Push(url);
                    }
                }

                string title = TitlePattern.Match(html).Groups["title"].Value;
                if (!string.IsNullOrEmpty(title))
                {
                    AddUrl(this.Url, title);
                }
            }
            catch (System.Exception ex)
            {
                // Keep the original best-effort contract, but leave a trace instead of
                // silently discarding the failure.
                System.Diagnostics.Trace.TraceError("ChainNode.Process failed: " + ex);
            }
        }

        /// <summary>
        /// Normalizes a raw href capture: if it starts with a quote character, that quote is
        /// removed along with the first remaining single quote and the first remaining
        /// double quote; then everything from the first space onward is cut off.
        /// </summary>
        /// <param name="url">Raw text captured by the href pattern.</param>
        /// <returns>The cleaned URL text (possibly empty).</returns>
        private static string CleanHref(string url)
        {
            if (url.Length > 0 && (url[0] == '\'' || url[0] == '"'))
            {
                url = url.Remove(0, 1);

                int quote = url.IndexOf('\'');
                if (quote != -1)
                {
                    url = url.Remove(quote, 1);
                }

                quote = url.IndexOf('"');
                if (quote != -1)
                {
                    url = url.Remove(quote, 1);
                }
            }

            // Drop any trailing attributes that followed the URL in an unquoted-style capture.
            int space = url.IndexOf(' ');
            if (space != -1)
            {
                url = url.Remove(space);
            }

            return url;
        }

        /// <summary>
        /// Calls the <c>AddWeb</c> stored procedure with the given URL and title, using the
        /// connection string stored under the "DB" app setting.
        /// </summary>
        /// <param name="url">Value passed as the <c>@url</c> parameter.</param>
        /// <param name="title">Value passed as the <c>@title</c> parameter.</param>
        private void AddUrl(string url, string title)
        {
            using (System.Data.SqlClient.SqlConnection conn = new System.Data.SqlClient.SqlConnection())
            {
                conn.ConnectionString = System.Configuration.ConfigurationManager.AppSettings["DB"];
                conn.Open();

                using (System.Data.SqlClient.SqlCommand cmd = conn.CreateCommand())
                {
                    cmd.CommandText = "AddWeb";
                    cmd.CommandType = System.Data.CommandType.StoredProcedure;
                    cmd.Parameters.AddWithValue("@url", url);
                    cmd.Parameters.AddWithValue("@title", title);

                    cmd.ExecuteNonQuery();
                }
            }
        }
    }
}

自定义类MyServer继承AbsThreadManager,重写GetChainHeader方法,告诉程序职责链的头节点是哪一个处理节点。

namespace SpiderServer
{
    /// <summary>
    /// Thread manager for this service; supplies <see cref="ChainNode"/> as the head of
    /// the processing chain.
    /// </summary>
    class MyServer : WebSpider.AbsThreadManager
    {
        /// <summary>
        /// Creates the head node of the processing chain.
        /// </summary>
        /// <returns>A new <see cref="ChainNode"/> instance on every call.</returns>
        protected override WebSpider.AbsChain GetChainHeader()
        {
            WebSpider.AbsChain head = new ChainNode();
            return head;
        }
    }
}

完成启动服务与停止服务过程

namespace SpiderServer
{
    /// <summary>
    /// Windows service that runs the spider. On start it loads the pending URLs from the
    /// <c>TempSplider</c> table into <c>WebSpider.UrlStack</c> and starts the worker manager;
    /// on stop it stops the manager and writes the remaining URLs back to the same table.
    /// </summary>
    public partial class SpiderServer : ServiceBase
    {
        private readonly MyServer server = new MyServer();

        public SpiderServer()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Loads every <c>Url</c> row of <c>TempSplider</c> onto the URL stack, then starts
        /// the spider. The connection string comes from the "DB" app setting.
        /// </summary>
        /// <param name="args">Service start arguments (not used).</param>
        protected override void OnStart(string[] args)
        {
            using (SqlConnection conn = new SqlConnection())
            {
                conn.ConnectionString = System.Configuration.ConfigurationManager.AppSettings["DB"];
                conn.Open();
                using (SqlCommand cmd = conn.CreateCommand())
                {
                    cmd.CommandText = "select Url from TempSplider";
                    cmd.CommandType = CommandType.Text;

                    using (SqlDataReader dr = cmd.ExecuteReader())
                    {
                        while (dr.Read())
                        {
                            WebSpider.UrlStack.Instance.Push(dr[0].ToString());
                        }
                    }
                }
            }
            server.Start("");
        }

        /// <summary>
        /// Stops the spider and replaces the contents of <c>TempSplider</c> with the URLs
        /// still on the URL stack.
        /// </summary>
        /// <remarks>
        /// The delete and the inserts run in one transaction, so if saving fails part-way
        /// the previous table contents are kept instead of being left empty or partial.
        /// </remarks>
        protected override void OnStop()
        {
            server.Stop();
            using (SqlConnection conn = new SqlConnection())
            {
                conn.ConnectionString = System.Configuration.ConfigurationManager.AppSettings["DB"];
                conn.Open();

                // Disposing an uncommitted transaction rolls it back.
                using (SqlTransaction tx = conn.BeginTransaction())
                {
                    using (SqlCommand cmd = conn.CreateCommand())
                    {
                        cmd.Transaction = tx;
                        cmd.CommandType = CommandType.Text;
                        cmd.CommandText = "delete from TempSplider";
                        cmd.ExecuteNonQuery();
                    }

                    // Snapshot the count first: the stack shrinks as it is popped.
                    int count = WebSpider.UrlStack.Instance.Count;
                    for (int i = 0; i < count; i++)
                    {
                        string url = WebSpider.UrlStack.Instance.Pop();
                        using (SqlCommand cmd = conn.CreateCommand())
                        {
                            cmd.Transaction = tx;
                            cmd.CommandType = CommandType.Text;
                            cmd.CommandText = "insert into TempSplider(Url) values(@url)";
                            cmd.Parameters.AddWithValue("@url", url);
                            cmd.ExecuteNonQuery();
                        }
                    }

                    tx.Commit();
                }
            }
        }
    }
}

编译工程,生成SpiderServer.exe文件,用.net自带的installutil.exe工具将服务安装好就可以了。记住,每次启动服务前,表TempSplider中都必须有URL记录,因为程序要从表中装载URL到UrlStack中,工作线程就是从UrlStack中取出URL并获取相应的HTML的。

源码下载地址:http://download.csdn.net/source/460975

评论 9
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值