以下代码是一个综合了拉勾网与前程无忧网招聘信息爬取功能的爬虫。整体逻辑较为复杂,此处不逐行讲解,直接给出完整代码:
"""
关于拉勾网和前程无忧网的爬虫
作者:jc
时间:2020.7.17
"""
import time
import configparser
import bs4
import csv
import requests
from lxml import etree
import threading
import random
import time
import datetime
import pymysql
from queue import Queue
from threading import Thread
from bs4 import BeautifulSoup
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from DBUtils.PooledDB import PooledDB
# 关闭安全请求警告
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
i=1
class Job51Spider:
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/65.0.3325.181 Safari/537.36",
}
jobs = ['软件', '图像', '自然语言处理', '人工智能', '学习', '前端', '后端', '数据', '算法', '测试', '网络安全', '运维', 'UI', '区块链', '网络', '全栈',
'硬件', 'Java', 'C++', 'PHP', 'C#', '.NET', 'Hadoop', 'Python', 'Perl', 'Ruby', 'Nodejs', 'Go',
'Javascript',
'Delphi', 'jsp', 'sql']
citys = ['广州','上海','北京', '深圳', '成都', '南宁','合肥','杭州', '南京', '苏州', '西安', '长沙', '重庆','东莞', '无锡', '福州', '大连', '宁波','武汉',
'郑州', '济南', '天津', '佛山', '昆山', '沈阳', '青岛', '珠海', '厦门', '昆明', '南昌', '常州', '中山', '惠州', '长春', '哈尔滨',
'嘉兴', '石家庄', '贵阳', '南通', '张家港', '兰州', '海口', '江门', '温州', '徐州', '扬州', '太原', '烟台', '镇江', '泉州', '唐山', '绵阳',
'太仓', '洛阳', '金华', '台州', '湖州', '柳州', '威海', '芜湖', '义乌', '保定', '泰州', '秦皇岛', '咸阳', '株洲', '韶关', '常熟', '澳门',
'湘潭', '宜昌', '香港', '盐城', '潍坊', '襄阳', '绍兴', '马鞍山', '三亚', '汕头', '宿迁', '鹰潭', '乌鲁木齐', '连云港', '呼和浩特', '德阳',
'岳阳',
'靖江', '延安', '莆田', '新乡', '桂林', '盘锦', '鄂州', '滁州', '玉林', '黄石', '邢台', '云浮', '大理', '九江', '自贡', '济宁', '漳州',
'揭阳',
'银川', '梅州', '鄂尔多斯', '宜春', '上饶', '鞍山', '枣庄', '六安', '荆门', '赣州', '龙岩', '西宁', '孝感', '德州', '南平', '泰安', '菏泽',
'阜阳', '拉萨', '清远', '宿州', '丽水', '铜陵', '湛江', '沧州', '黄山', '阿克苏', '舟山', '安庆', '临沂', '衢州', '南阳', '肇庆', '随州',
'吉安', '兴安盟', '萍乡', '攀枝花', '承德', '上海']
def run(self):
print("开始爬取")
conf = configparser.ConfigParser()
conf.read('C:/Users/Administrator/source/repos/51job爬虫/51job爬虫/Spider51/conf.ini')
for city in self.citys:
for job in self.jobs:
citycode = conf['citycode'][city]
print("获取到的城市代码为", citycode)
page = 1
# 获得总页数
url = "https://search.51job.com/list/{},000000,0000,00,9,99,{},2," \
"{}.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99" \
"&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line" \
"=&specialarea=00&from=&welfare=".format(citycode, job, page)
a = requests.get(url=url, headers=self.headers)
a.encoding = 'gbk'
try:
html = etree.HTML(a.text)
maxpage = html.xpath('//*[@id="resultList"]/div[2]/div[5]/text()')[2].replace('/', '').strip()
maxpage = eval(maxpage)
# 解析页数
while True:
url = "https://search.51job.com/list/{},000000,0000,00,9,99,{},2," \
"{}.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99" \
"&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line" \
"=&specialarea=00&from=&welfare=".format(citycode, job, page)
self.get_urls(url)
print('多线程+' + str(page) + '页完成--' + city + job)
page = page + 1
if page == maxpage + 1:
break
except:
pass
def get_urls(self, url):
try:
a = requests.get(url=url, headers=self.headers)
a.encoding = 'gbk'
html = etree.HTML(a.content)
urls = html.xpath('//*[@id="resultList"]/div[@class="el"]/p/span/a')
for i in urls:
t = threading.Thread(target=self.get_job_detail