I. Choose two recruitment websites. Candidate sites include: 智联招聘, 前程无忧, 应届生求职, 拉勾, and 中华英才网.
Selected sites: 前程无忧 (51job) and 拉勾网 (Lagou)
1. Page analysis
Open the 前程无忧 search results page, note its URL, and view the page source (前程无忧 link).
2. To extract the data we want, use the browser developer tools (F12) to work out how that data is laid out in the page.
Fields to crawl: job title, salary, hiring company, work location, work experience, education requirement, job description (responsibilities), and job requirements (skills).
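These fields can be declared as a Scrapy Item so that every record the spider yields has the same structure. The sketch below is only one possible definition; the class and field names are illustrative and are not taken from the original project:

# items.py -- a possible Item definition for the fields listed above
import scrapy

class JobItem(scrapy.Item):
    job_name = scrapy.Field()      # job title
    salary = scrapy.Field()        # salary
    company = scrapy.Field()       # hiring company
    location = scrapy.Field()      # work location
    experience = scrapy.Field()    # work experience
    education = scrapy.Field()     # education requirement
    duty = scrapy.Field()          # job description / responsibilities
    requirement = scrapy.Field()   # job requirements / skills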
1. Set up the crawler project with Scrapy (a typical command sequence is sketched after this list)
2. Write the spider code
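For step 1, the project skeleton is usually generated with Scrapy's command-line tools. The project name qcwy below is just an example; the spider name qc and the domain 51job.com match the code that follows:

scrapy startproject qcwy
cd qcwy
scrapy genspider qc 51job.com

The spider code below then goes into the generated spiders/qc.py file.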
# -*- coding: utf-8 -*-
import scrapy


class QcSpider(scrapy.Spider):
    name = 'qc'
    allowed_domains = ['51job.com']
    x = 0  # counter for the number of detail pages crawled

    def start_requests(self):
        job = input("Enter the job keyword to crawl: ")
        c = int(input("Enter the number of result pages to crawl: "))
        # Build one search-results URL per page; the keyword and the page
        # number are spliced into the 51job search URL.
        for x in range(1, c + 1):
            url = ("https://search.51job.com/list/020000%252C030200%252C090200%252C010000%252C040000,000000,0000,00,9,99,"
                   + job + ",2," + str(x)
                   + ".html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=")
            yield scrapy.Request(url, callback=self.parse)
    def parse(self, response):
        # Each job posting on the results page sits in a div with class "el";
        # follow the link in each row to the job's detail page.
        selectors = response.xpath('//div[@class="el"]')
        for selector in selectors:
            url = selector.xpath('./p/span/a/@href').get()
            if url:
                yield scrapy.Request(url, callback=self.parseDatail)
    def parseDatail(self, response):
        """
        Handle the data on a job detail page.
        :param response: the detail-page response
        :return: the data extracted from the detail page
        """
        # Keep a running count of how many detail pages have been processed.
        x = self.x + 1
        print('Crawling record No. ' + str(x) + ' .........')
        self.x = x
        # Job title and the raw salary string (a range such as "1-1.5万/月").
        job_name = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/h1/text()').get(default='')
        qinxi = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/strong/text()').get(default='')
        # Split the salary range on "-" and keep the upper bound; fall back to
        # an empty string when the salary is missing or has no range.
        s = ''.join(qinxi)
        qinxis = s.split('-')
        try:
            job_gongzi = qinxis[1]
        except IndexError:
            job_gongzi = ''
        # Company name, work location, experience and education requirements.
        job_danwei = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a[1]/text()').get(default='')
        job_dizhi = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[2]/text()[1]').get(default='')
        job_jingyan = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[2]/text()[2]').get(default='')
        job_xueli = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[2]/text()[3]').get(default='')
        # All text nodes of the job-description block (responsibilities and requirements).
        re = response.xpath('/html/body/div[3]/div[2]/div[3]/div[1]/div/*/text()').getall()
s=