以下代码是一个综合了拉勾网与前程无忧网招聘信息爬取功能的爬虫。整体逻辑较为复杂,此处不逐行讲解,直接给出完整代码:
"""
关于拉勾网和前程无忧网的爬虫
作者:jc
时间:2020.7.17
"""
import time
import configparser
import bs4
import csv
import requests
from lxml import etree
import threading
import random
import time
import datetime
import pymysql
from queue import Queue
from threading import Thread
from bs4 import BeautifulSoup
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from DBUtils.PooledDB import PooledDB
# 关闭安全请求警告
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
i=1
class Job51Spider:
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/65.0.3325.181 Safari/537.36",
}
jobs = ['软件', '图像', '自然语言处理', '人工智能', '学习', '前端', '后端', '数据', '算法', '测试', '网络安全', '运维', 'UI', '区块链', '网络', '全栈',
'硬件', 'Java', 'C++', 'PHP', 'C#', '.NET', 'Hadoop', 'Python', 'Perl', 'Ruby', 'Nodejs', 'Go',
'Javascript',
'Delphi', 'jsp', 'sql']
citys = ['广州','上海','北京', '深圳', '成都', '南宁','合肥','杭州', '南京', '苏州', '西安', '长沙', '重庆','东莞', '无锡', '福州', '大连', '宁波','武汉',
'郑州', '济南', '天津', '佛山', '昆山', '沈阳', '青岛', '珠海', '厦门', '昆明', '南昌', '常州', '中山', '惠州', '长春', '哈尔滨',
'嘉兴', '石家庄', '贵阳', '南通', '张家港', '兰州', '海口', '江门', '温州', '徐州', '扬州', '太原', '烟台', '镇江', '泉州', '唐山', '绵阳',
'太仓', '洛阳', '金华', '台州', '湖州', '柳州', '威海', '芜湖', '义乌', '保定', '泰州', '秦皇岛', '咸阳', '株洲', '韶关', '常熟', '澳门',
'湘潭', '宜昌', '香港', '盐城', '潍坊', '襄阳', '绍兴', '马鞍山', '三亚', '汕头', '宿迁', '鹰潭', '乌鲁木齐', '连云港', '呼和浩特', '德阳',
'岳阳',
'靖江', '延安', '莆田', '新乡', '桂林', '盘锦', '鄂州', '滁州', '玉林', '黄石', '邢台', '云浮', '大理', '九江', '自贡', '济宁', '漳州',
'揭阳',
'银川', '梅州', '鄂尔多斯', '宜春', '上饶', '鞍山', '枣庄', '六安', '荆门', '赣州', '龙岩', '西宁', '孝感', '德州', '南平', '泰安', '菏泽',
'阜阳', '拉萨', '清远', '宿州', '丽水', '铜陵', '湛江', '沧州', '黄山', '阿克苏', '舟山', '安庆', '临沂', '衢州', '南阳', '肇庆', '随州',
'吉安', '兴安盟', '萍乡', '攀枝花', '承德', '上海']
def run(self):
print("开始爬取")
conf = configparser.ConfigParser()
conf.read('C:/Users/Administrator/source/repos/51job爬虫/51job爬虫/Spider51/conf.ini')
for city in self.citys:
for job in self.jobs:
citycode = conf['citycode'][city]
print("获取到的城市代码为", citycode)
page = 1
# 获得总页数
url = "https://search.51job.com/list/{},000000,0000,00,9,99,{},2," \
"{}.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99" \
"&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line" \
"=&specialarea=00&from=&welfare=".format(citycode, job, page)
a = requests.get(url=url, headers=self.headers)
a.encoding = 'gbk'
try:
html = etree.HTML(a.text)
maxpage = html.xpath('//*[@id="resultList"]/div[2]/div[5]/text()')[2].replace('/', '').strip()
maxpage = eval(maxpage)
# 解析页数
while True:
url = "https://search.51job.com/list/{},000000,0000,00,9,99,{},2," \
"{}.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99" \
"&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line" \
"=&specialarea=00&from=&welfare=".format(citycode, job, page)
self.get_urls(url)
print('多线程+' + str(page) + '页完成--' + city + job)
page = page + 1
if page == maxpage + 1:
break
except:
pass
def get_urls(self, url):
try:
a = requests.get(url=url, headers=self.headers)
a.encoding = 'gbk'
html = etree.HTML(a.content)
urls = html.xpath('//*[@id="resultList"]/div[@class="el"]/p/span/a')
for i in urls:
t = threading.Thread(target=self.get_job_detail