C# 网络爬虫

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Data.SqlClient;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using DataCollect.Model;
using HtmlAgilityPack;//引入插件
using Dapper;//引入插件

namespace DataCollect.UI
{
    public partial class MainForm1 : DevExpress.XtraEditors.XtraForm
    {
        // Cancellation source shared by the UI and the background collector:
        // simpleButton2_Click cancels it, Collect/GetNode poll it.
        private CancellationTokenSource cts = new CancellationTokenSource();

        // Jobs collected so far; appended to by GetNode and written to the
        // database by simpleButton3_Click.
        private List<Job> list = new List<Job>();

        /// <summary>
        /// Creates the form and builds its designer-generated controls.
        /// </summary>
        public MainForm1()
        {
            this.InitializeComponent();
        }

        /// <summary>
        /// Load event handler for the form. Intentionally empty: no work is done
        /// when the form loads. (Presumably wired up in the designer file, which
        /// is not visible here.)
        /// </summary>
        /// <param name="sender">Event source.</param>
        /// <param name="e">Event data (unused).</param>
        private void MainForm1_Load(object sender, EventArgs e)
        {

        }

        // The collection task started most recently, or null if none was started yet.
        private Task collectTask;

        /// <summary>
        /// Starts the data collection on a background task so the UI thread stays
        /// responsive. Clicking again while a collection is still running is ignored.
        /// </summary>
        /// <param name="sender">Event source.</param>
        /// <param name="e">Event data (unused).</param>
        private void simpleButton1_Click(object sender, EventArgs e)
        {
            // Two collectors running at once would both append to the same
            // List<Job>, which is not safe for concurrent writers.
            if (collectTask != null && !collectTask.IsCompleted)
            {
                return;
            }

            // A CancellationTokenSource that has been cancelled stays cancelled
            // forever; without a fresh one every run after the first "cancel"
            // would stop immediately.
            if (cts.IsCancellationRequested)
            {
                cts = new CancellationTokenSource();
            }

            // Background threads cannot touch controls directly; Collect marshals
            // its UI updates back through Control.Invoke.
            collectTask = Task.Run(() => Collect());
        }

        /// <summary>
        /// Collects job listings page by page, starting at page 1.
        /// The total page count is read from the hidden <c>hidTotalPage</c> input
        /// on the first page; every page from 1 up to and including that count is
        /// loaded and handed to <see cref="GetNode"/>. Stops early when
        /// cancellation has been requested.
        /// </summary>
        private void Collect()
        {
            int index = 1;
            int pageCount = 0;
            do
            {
                // Do not start downloading another page after a cancel request.
                if (cts.IsCancellationRequested)
                {
                    return;
                }

                var htmlDocument = DataLoad(index);
                if (index == 1)
                {
                    // The first page carries the total number of result pages in a hidden input.
                    HtmlNode htmlNode = htmlDocument.DocumentNode.SelectSingleNode("//input[@id='hidTotalPage']");
                    HtmlAttribute totalAttribute = htmlNode == null ? null : htmlNode.Attributes["value"];
                    if (totalAttribute == null || !int.TryParse(totalAttribute.Value, out pageCount))
                    {
                        // Page count missing or not numeric: treat the result as a single page
                        // instead of crashing the background task.
                        pageCount = 1;
                    }

                    // Controls belong to the UI thread, so SetLable marshals the update via Invoke.
                    SetLable(this.labelControl6, pageCount.ToString());
                }

                // Show the page currently being collected.
                SetLable(this.labelControl4, index.ToString());

                // The header row of the result table; the job rows are its following siblings.
                HtmlNode hn = htmlDocument.DocumentNode.SelectSingleNode("//div[@class='el title']");
                if (hn != null && hn.NextSibling != null && hn.NextSibling.NextSibling != null)
                {
                    // The immediate sibling is the whitespace/newline text node after the
                    // header (text nodes count as siblings), so skip one node.
                    GetNode(hn.NextSibling.NextSibling);
                }

                if (cts.IsCancellationRequested)
                {
                    return;
                }

                index++;

                // "<=" so that the last page is collected too.
            } while (index <= pageCount);
        }

        /// <summary>
        /// Walks the sibling chain starting at <paramref name="parentNode"/> and
        /// turns every job row into a <see cref="Job"/>, appending it to the
        /// collected list and to the list view. The walk ends at the end of the
        /// sibling chain, just before an element whose class is <c>dw_page</c>,
        /// or when cancellation is requested.
        /// </summary>
        /// <param name="parentNode">First node to inspect; null means there is nothing to do.</param>
        private void GetNode(HtmlNode parentNode)
        {
            // Iterating instead of recursing keeps the stack depth constant no
            // matter how many sibling nodes the page contains.
            for (HtmlNode node = parentNode; node != null; node = node.NextSibling)
            {
                if (cts.IsCancellationRequested)
                {
                    return;
                }

                Thread.Sleep(100); // pause 0.1 s per node

                // A job row is read at child indexes 1, 3, 5, 7 and 9, so anything
                // with fewer than ten children (e.g. whitespace text nodes) is skipped.
                if (node.ChildNodes.Count > 9)
                {
                    Job job = new Job();
                    job.Position = node.ChildNodes[1].InnerText.Replace("\r\n", "").Trim();
                    job.Company = node.ChildNodes[3].InnerText;
                    job.Address = node.ChildNodes[5].InnerText;
                    job.Salary = node.ChildNodes[7].InnerText;
                    job.SendTime = node.ChildNodes[9].InnerText;
                    list.Add(job);

                    SetLable(labelControl2, list.Count.ToString());

                    // The list view belongs to the UI thread; marshal the insert through Invoke.
                    this.listView1.Invoke(new Action(() =>
                    {
                        ListViewItem item = new ListViewItem();
                        item.Text = job.Position; // first column
                        item.SubItems.AddRange(new string[] {
                            job.Company,
                            job.Address,
                            job.Salary,
                            job.SendTime
                        });
                        this.listView1.Items.Add(item);
                    }));
                }

                HtmlNode next = node.NextSibling;
                if (next == null)
                {
                    // End of the sibling chain without a pager element.
                    return;
                }

                // The pager element marks the end of the job rows.
                HtmlAttribute classAttribute = next.Attributes["class"];
                if (classAttribute != null && classAttribute.Value == "dw_page")
                {
                    return;
                }
            }
        }

        /// <summary>
        /// Sets a label's text from any thread by running the assignment through
        /// the label's <c>Invoke</c>.
        /// </summary>
        /// <param name="lbl">Label to update.</param>
        /// <param name="value">Text to show.</param>
        private void SetLable(DevExpress.XtraEditors.LabelControl lbl, string value)
        {
            Action assignText = () => lbl.Text = value;
            lbl.Invoke(assignText);
        }
        /// <summary>
        /// Downloads one page of the 51job ".net" search results and parses it
        /// into an HTML document.
        /// </summary>
        /// <param name="index">Page number that is substituted into the search URL.</param>
        /// <returns>The parsed HTML document for that page.</returns>
        private HtmlAgilityPack.HtmlDocument DataLoad(int index)
        {
            // The response is decoded as GBK rather than whatever the loader would pick.
            HtmlWeb pageLoader = new HtmlWeb
            {
                OverrideEncoding = Encoding.GetEncoding("gbk")
            };

            string pageUrl = $"https://search.51job.com/list/000000,000000,0000,00,9,99,.net,2,{index}.html";
            return pageLoader.Load(pageUrl);
        }

        /// <summary>
        /// Cancel button handler: signals the shared cancellation source so the
        /// running collection stops at its next check.
        /// </summary>
        /// <param name="sender">Event source.</param>
        /// <param name="e">Event data (unused).</param>
        private void simpleButton2_Click(object sender, EventArgs e) => this.cts.Cancel();

        /// <summary>
        /// Export button handler: inserts every collected job into the
        /// <c>Job</c> table of the <c>QCSJ</c> database. All rows are written in
        /// one transaction, so a failure leaves the table unchanged. Database
        /// errors are shown to the user instead of crashing the application.
        /// </summary>
        /// <param name="sender">Event source.</param>
        /// <param name="e">Event data (unused).</param>
        private void simpleButton3_Click(object sender, EventArgs e)
        {
            // Work on a copy so the insert does not enumerate the live list while
            // a collection may still be appending to it.
            // NOTE(review): the copy itself is not synchronized with the collector;
            // exporting after the collection has finished or been cancelled is safest.
            Job[] snapshot = list.ToArray();
            if (snapshot.Length == 0)
            {
                // Nothing collected yet: nothing to insert, so skip the database round trip.
                return;
            }

            try
            {
                // NOTE(review): the connection string embeds the sa password in
                // source code; it should come from protected configuration.
                using (SqlConnection conn = new SqlConnection("server=.;uid=sa;pwd=adminsystem;database=QCSJ"))
                {
                    conn.Open();
                    using (SqlTransaction transaction = conn.BeginTransaction())
                    {
                        string sql = @"insert into Job(Position,Company,Address,Salary,SendTime) values(@Position,@Company,@Address,@Salary,@SendTime)";

                        // Passing a sequence makes Dapper run the statement once per element.
                        int result = conn.Execute(sql, snapshot, transaction);
                        transaction.Commit();

                        if (result > 0)
                        {
                            MessageBox.Show("采集完毕!!");
                        }
                    }
                }
            }
            catch (SqlException ex)
            {
                // Disposing the uncommitted transaction rolls it back; report the failure.
                MessageBox.Show(ex.Message);
            }
        }
    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace DataCollect.Model
{
    /// <summary>
    /// One collected job listing. The five string properties carry the same
    /// names as the parameters of the insert statement used by the export.
    /// </summary>
    public class Job
    {
        /// <summary>
        /// Row id.
        /// </summary>
        public int id { get; set; }
        /// <summary>
        /// Job title / position.
        /// </summary>
        public string Position { get; set; }
        /// <summary>
        /// Company name.
        /// </summary>
        public string Company { get; set; }
        /// <summary>
        /// Work location.
        /// </summary>
        public string Address { get; set; }
        /// <summary>
        /// Salary text as scraped from the page.
        /// </summary>
        public string Salary { get; set; }
        /// <summary>
        /// Publication date text as scraped from the page.
        /// </summary>
        public string SendTime { get; set; }
    }
}
-- Creates the QCSJ database from scratch.
-- WARNING: an existing QCSJ database is dropped first, together with all its data.
USE master
GO
-- Drop the database if it already exists so the script can be re-run.
IF EXISTS(SELECT * FROM SYS.DATABASES WHERE NAME='QCSJ')
DROP DATABASE QCSJ
GO
-- The data and log file paths below are hard-coded to E:\DataBase_Source.
CREATE DATABASE QCSJ
ON PRIMARY
(
	NAME=QCSJ_DATA,
	FILENAME='E:\DataBase_Source\QCSJ_DATA.mdf',
	SIZE=5MB,
	MAXSIZE=1000MB,
	FILEGROWTH=15%
)
LOG ON
(
	NAME=QCSJ_LOG,
	FILENAME='E:\DataBase_Source\QCSJ_LOG.ldf',
	SIZE=5MB,
	FILEGROWTH=15%
)
GO

 

-- Creates the Job table that receives the exported job listings.
-- WARNING: an existing Job table is dropped first, together with its rows.
USE QCSJ
GO
-- OBJECT_ID with type 'U' matches only a user table named dbo.Job, not a
-- view, procedure or other object that happens to share the name.
IF OBJECT_ID(N'dbo.Job', N'U') IS NOT NULL
DROP TABLE dbo.Job
GO
-- The text columns are nvarchar because the scraped values are Chinese text:
-- varchar can only hold characters of the database collation's code page,
-- so on a non-Chinese collation the data would be stored as '?'.
CREATE TABLE dbo.Job
(
	id int identity(1,1) primary key,
	Position nvarchar(50) not null,   -- job title
	Company nvarchar(50) not null,    -- company name
	Address nvarchar(100) not null,   -- work location
	Salary nvarchar(30) not null,     -- salary text
	SendTime nvarchar(30) not null    -- publication date text
)
GO

  • 5
    点赞
  • 22
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值