Python web scraping: collecting job postings from Lagou and 51job (multithreading, database storage, anti-scraping measures)

The code below is a spider that crawls job postings from both Lagou (拉勾网) and 51job (前程无忧). A full walkthrough would be lengthy, so the code is given directly, with the key steps annotated in comments:

"""
关于拉勾网和前程无忧网的爬虫
作者:jc
时间:2020.7.17
"""
import time
import configparser
import bs4
import csv
import requests
from lxml import etree
import threading
import random
import datetime
import pymysql
from queue import Queue
from threading import Thread
from bs4 import BeautifulSoup
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from DBUtils.PooledDB import PooledDB

# Suppress InsecureRequestWarning for unverified HTTPS requests
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
i = 1

class Job51Spider:
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/65.0.3325.181 Safari/537.36",
    }
    jobs = ['软件', '图像', '自然语言处理', '人工智能', '学习', '前端', '后端', '数据', '算法', '测试', '网络安全', '运维', 'UI', '区块链', '网络', '全栈',
            '硬件', 'Java', 'C++', 'PHP', 'C#', '.NET', 'Hadoop', 'Python', 'Perl', 'Ruby', 'Nodejs', 'Go',
            'Javascript',
            'Delphi', 'jsp', 'sql']

    citys = ['广州','上海','北京', '深圳', '成都', '南宁','合肥','杭州', '南京', '苏州', '西安', '长沙', '重庆','东莞', '无锡', '福州', '大连', '宁波','武汉',
             '郑州', '济南', '天津', '佛山', '昆山', '沈阳', '青岛', '珠海', '厦门', '昆明', '南昌', '常州', '中山', '惠州', '长春', '哈尔滨',
             '嘉兴', '石家庄', '贵阳', '南通', '张家港', '兰州', '海口', '江门', '温州', '徐州', '扬州', '太原', '烟台', '镇江', '泉州', '唐山', '绵阳',
             '太仓', '洛阳', '金华', '台州', '湖州', '柳州', '威海', '芜湖', '义乌', '保定', '泰州', '秦皇岛', '咸阳', '株洲', '韶关', '常熟', '澳门',
             '湘潭', '宜昌', '香港', '盐城', '潍坊', '襄阳', '绍兴', '马鞍山', '三亚', '汕头', '宿迁', '鹰潭', '乌鲁木齐', '连云港', '呼和浩特', '德阳',
             '岳阳',
             '靖江', '延安', '莆田', '新乡', '桂林', '盘锦', '鄂州', '滁州', '玉林', '黄石', '邢台', '云浮', '大理', '九江', '自贡', '济宁', '漳州',
             '揭阳',
             '银川', '梅州', '鄂尔多斯', '宜春', '上饶', '鞍山', '枣庄', '六安', '荆门', '赣州', '龙岩', '西宁', '孝感', '德州', '南平', '泰安', '菏泽',
             '阜阳', '拉萨', '清远', '宿州', '丽水', '铜陵', '湛江', '沧州', '黄山', '阿克苏', '舟山', '安庆', '临沂', '衢州', '南阳', '肇庆', '随州',
             '吉安', '兴安盟', '萍乡', '攀枝花', '承德']





    def run(self):
        print("Start crawling 51job")
        conf = configparser.ConfigParser()
        conf.read('C:/Users/Administrator/source/repos/51job爬虫/51job爬虫/Spider51/conf.ini')
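        # conf.ini is assumed to contain a [citycode] section mapping each city
        # name to the numeric area code 51job uses in its search URLs, e.g.:
        #   [citycode]
        #   北京 = 010000
        #   上海 = 020000
        #   广州 = 030200
        # (illustrative values only; the actual file is not part of this listing)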
        for city in self.citys:
            for job in self.jobs:
                citycode = conf['citycode'][city]
                print("Retrieved city code:", citycode)
                page = 1
                # Request the first result page to read the total page count
                url = "https://search.51job.com/list/{},000000,0000,00,9,99,{},2," \
                      "{}.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99" \
                      "&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line" \
                      "=&specialarea=00&from=&welfare=".format(citycode, job, page)
                a = requests.get(url=url, headers=self.headers)
                a.encoding = 'gbk'
                try:
                    html = etree.HTML(a.text)
                    # The pager text carries the total page count after a slash; strip it out
                    maxpage = html.xpath('//*[@id="resultList"]/div[2]/div[5]/text()')[2].replace('/', '').strip()
                    maxpage = int(maxpage)
                    # Walk through every result page for this city/keyword pair
                    while True:
                        url = "https://search.51job.com/list/{},000000,0000,00,9,99,{},2," \
                              "{}.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99" \
                              "&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line" \
                              "=&specialarea=00&from=&welfare=".format(citycode, job, page)
                        self.get_urls(url)
                        print('Dispatched threads for page ' + str(page) + ' -- ' + city + ' ' + job)
                        page = page + 1
                        if page == maxpage + 1:
                            break
                except Exception:
                    # Skip city/keyword pairs whose result page cannot be parsed
                    pass

    def get_urls(self, url):
        try:
            a = requests.get(url=url, headers=self.headers)
            a.encoding = 'gbk'
            html = etree.HTML(a.content)
            # Each row in the result list links to a job-detail page;
            # spawn one worker thread per detail URL
            urls = html.xpath('//*[@id="resultList"]/div[@class="el"]/p/span/a')
            for link in urls:
                t = threading.Thread(target=self.get_job_detail, args=(link.get('href'),))
                t.start()
        except Exception:
            pass
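
The listing hands each detail URL off to get_job_detail, and the storage side that the imports promise (pymysql plus a PooledDB connection pool) is not shown above. A minimal sketch of how that part could be wired up, reusing the pymysql and PooledDB imports from the top of the file; the host, credentials, database name, table, and columns below are placeholder assumptions, not the original code:

# Sketch only: host, credentials, database and table layout are assumptions.
pool = PooledDB(creator=pymysql, maxconnections=10, blocking=True,
                host='localhost', user='root', password='your_password',
                database='jobs', charset='utf8mb4')

def save_job(record):
    """Insert one job posting; `record` is a tuple matching the column order."""
    conn = pool.connection()           # borrow a connection from the pool
    try:
        with conn.cursor() as cursor:
            cursor.execute(
                "INSERT INTO job51 (title, company, city, salary, url) "
                "VALUES (%s, %s, %s, %s, %s)", record)
        conn.commit()
    finally:
        conn.close()                   # return the connection to the pool

Each worker thread can call a helper like save_job() after parsing a detail page, and the pool keeps the number of open MySQL connections bounded even when many threads are running.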
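
On the anti-scraping side, the code above only sends a fixed desktop User-Agent. A common extension, which the random and time imports hint at, is to rotate User-Agents and pace requests with short random delays. A small illustrative sketch follows; the helper name, User-Agent pool, and delay range are arbitrary choices, not taken from the original:

import random
import time
import requests

# Illustrative pool of desktop User-Agent strings; extend as needed.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/65.0.3325.181 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/65.0.3325.181 Safari/537.36",
]

def polite_get(url):
    """Fetch a page with a random User-Agent and a short random pause."""
    time.sleep(random.uniform(0.5, 2.0))            # avoid hammering the server
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    return requests.get(url, headers=headers, timeout=10)

Routing the requests.get calls in run() and get_urls() through such a helper keeps the request rate down and reduces the chance of being served CAPTCHA or empty result pages.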