需求:
1、 遍历首页所有职位分类
2、 点击进入职位分类详情页,按照地区抓取,职位名称,月薪,经验年限要求,学历要求,招聘公司,所属行业,轮次,人数(规模),发布时间
3、 点击进入职位详情页,抓取该职位的技能标签。
代码
代码中附有注释;脚本依赖本地代理(localhost:6666),没有可用代理时请慎用。
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-08-06 10:40:07
# Project: boss_recruit
from pyspider.libs.base_handler import *
import re
import datetime
from pymongo import MongoClient
# Connect to the offline MongoDB instance that stores scraped postings.
# The account lives in the `admin` database: connect, authenticate, then
# switch to the working database.
DB_NAME = 'research'
DB_COL = 'boss_recruit'
# NOTE(review): the original snippet referenced `client` without ever
# creating it, so `db = client[DB_NAME]` raised NameError at import time.
# Instantiate it here. TODO: fill in the real host/port and authenticate
# against `admin` (or pass credentials in the URI) to match the deployment.
client = MongoClient('localhost', 27017)
db = client[DB_NAME]
col = db[DB_COL]
class Handler(BaseHandler):
# Options merged into every self.crawl request issued by this handler.
crawl_config = {
    "headers": {
        # Masquerade as a desktop Chrome browser to avoid trivial bot blocking.
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
    },
    # Local forwarding proxy; requests will fail if nothing listens on this port.
    "proxy": "http://localhost:6666"
}
# Crawl entry point: the BOSS直聘 home page, scheduled by on_start.
url = 'https://www.zhipin.com/?ka=header-home'
def format_date(self, date):
    """Parse a compact ``YYYYMMDD`` date string into a ``datetime``.

    Raises ``ValueError`` if *date* does not match the ``%Y%m%d`` format.
    """
    parsed = datetime.datetime.strptime(date, '%Y%m%d')
    return parsed
@every(minutes=24 * 60)
def on_start(self):
    """Schedule a crawl of the home page once every 24 hours.

    Bug fix: the original called ``get_proxy()`` twice, so the proxy it
    printed could differ from the proxy actually passed to ``self.crawl``.
    Fetch it once and use the same value for both logging and the request.
    """
    proxy = get_proxy()
    print(proxy)  # log the proxy actually used for this request
    self.crawl(self.url, callback=self.index_page, proxy=proxy)
@config(age=60)
def index_page(self, response):
    """Walk the home-page job-category mega menu and queue every category.

    Each ``<li>`` under the menu is one industry group; its ``<a>``
    children are the concrete job categories whose listing pages are
    scheduled for ``detail_page``, carrying the category name in ``save``.
    """
    tree = response.etree
    site_root = 'https://www.zhipin.com'
    # One <li> per industry group in the home-page menu.
    for group in tree.xpath("//div[@class='job-menu']//div[@class='menu-sub']/ul/li"):
        group_name = group.xpath("./h4/text()")[0]
        print(group_name)
        # Every anchor inside the group is a single job category.
        for link in group.xpath("./div[@class='text']/a"):
            category_name = link.xpath("./text()")[0]
            category_url = site_root + link.xpath("./@href")[0]
            print(category_name, category_url)
            self.crawl(category_url,
                       callback=self.detail_page,
                       save={"belonging": category_name},
                       proxy=get_proxy())
@config(age=60)
def