爬取前程无忧网站
获取数据在网页的位置----编写防爬-----启动数据库保存数据
使用 MongoDB 数据库存储数据(也可替换为 MySQL)
1、具体要求:职位名称、薪资水平、招聘单位、工作地点、工作经验、学历要求、工作内容(岗位职责)、任职要求(技能要求)。
spider文件代码
import scrapy
import re
from ..items import QianchengwuyouItem
class WuyouSpider(scrapy.Spider):
    """Spider for 51job (前程无忧) "big data" job listings.

    Walks the paginated nationwide search results, follows every job
    posting link, and yields one ``QianchengwuyouItem`` per posting with
    title, salary, company, location, experience, education, job
    description, keywords and posting date.
    """

    name = 'wuyou'
    allowed_domains = ['search.51job.com', 'jobs.51job.com']
    # Fixed mojibake: the original URL contained "°reefrom=99" where the
    # "&deg" prefix of "&degreefrom" had been rendered as the HTML entity °.
    start_urls = ['https://search.51job.com/list/000000,000000,0000,00,3,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=1&dibiaoid=0&line=&welfare=']

    def parse(self, response):
        """Yield a detail-page request for every posting on this result
        page, then follow the "next page" link until it disappears."""
        # Each div.el row inside #resultList is one job posting.
        for row in response.xpath('//*[@id="resultList"]//div[@class="el"]'):
            detail_url = row.xpath('./p/span/a/@href').extract_first()
            # Guard: header/separator rows carry no link; extract_first()
            # would return None and crash scrapy.Request.
            if detail_url:
                # response.follow resolves relative hrefs against the page URL.
                yield response.follow(detail_url, callback=self.parse_details)
                print(detail_url)
        next_url = response.xpath(
            "//*[@id='resultList']//div[@class='p_in']//li/a[text()='下一页']/@href"
        ).extract_first()
        if next_url is not None:
            yield response.follow(next_url, callback=self.parse)
        else:
            print("结束!")

    def parse_details(self, response):
        """Extract one job posting from its detail page into an item."""
        print("=" * 100)
        print(response.url)
        print("正在爬取页面数据!")
        item = QianchengwuyouItem()

        # 职位名称 — iterating the string keeps only non-whitespace
        # characters, i.e. all whitespace is stripped from the title.
        # "or ''" guards against a missing <h1> (extract_first() -> None).
        title = response.xpath("//div[@class='cn']/h1/text()").extract_first() or ""
        item["Job_title"] = "".join(ch for ch in title if ch.strip()).replace(",", ",")

        # 薪资水平 — may legitimately be absent; normalised to "无" below.
        item["Pay_level"] = response.xpath("//div[@class='cn']/strong/text()").extract_first()

        # 招聘单位
        item["Recruitment_unit"] = response.xpath("//div[@class='cn']//a[1]/text()").extract_first()

        # 工作地点 + 工作经验 + 学历要求 all live in //div[@class='cn']/p[2].
        item["Workplace"] = (
            response.xpath("//div[@class='cn']/p[2]/text()[1]").get() or ""
        ).replace('\xa0', '')

        # 工作经验: keep only the fragment mentioning "经验".
        fragments = response.xpath("//div[@class='cn']/p[2]/text()").extract()
        experience = [frag.strip() for frag in fragments]
        experience = [frag for frag in experience if "经验" in frag]
        item["hands_background"] = " ".join(experience).replace("\xa0", "")
        if not item["hands_background"]:
            item["hands_background"] = "无"

        # 学历要求: the second text node is either experience text (>= 4
        # chars, education then sits in the third node) or the education
        # itself (exactly 2 chars, e.g. "本科").  The original code checked
        # len < 4 before len == 2, making the == 2 branch unreachable.
        second = (
            response.xpath("//div[@class='cn']/p[2]/text()[2]").get() or ""
        ).replace('\xa0', '')
        if len(second) >= 4:
            education = (
                response.xpath("//div[@class='cn']/p[2]/text()[3]").get() or ""
            ).replace('\xa0', '')
            item["Education_requirements"] = education if len(education) == 2 else "无"
        elif len(second) == 2:
            item["Education_requirements"] = second
        else:
            item["Education_requirements"] = "无"

        # 职位信息 (工作内容 + 任职要求)
        info = response.xpath("//div[@class='bmsg job_msg inbox']//text()").extract()
        info = [line.strip() for line in info]
        info = [line for line in info if line]
        item["Career_information"] = " ".join(info).replace("\xa0", "").replace(",", ",")

        if item["Pay_level"] is None:
            item["Pay_level"] = "无"

        # 关键字: keyword
        keywords = response.xpath("//div[@class='mt10']//p//a/text()").extract()
        keywords = [kw for kw in keywords if kw]
        item["keyword"] = " ".join(keywords).replace("\xa0", "").replace(",", ",")

        # 日期 — pulled from the "MM-DD发布" marker in the raw page; raw
        # string for the regex, and guarded so a missing marker does not
        # raise IndexError.
        dates = re.findall(r"(\d+-\d+)发布", response.text)
        item["day"] = dates[0] if dates else "无"

        yield item
        print("数据网页爬取成功!")
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class QianchengwuyouItem(scrapy.Item):
    """Container for a single 51job posting scraped by ``WuyouSpider``."""

    Job_title = scrapy.Field()               # job title (职位名称)
    Pay_level = scrapy.Field()               # salary range (薪资水平)
    Recruitment_unit = scrapy.Field()        # hiring company (招聘单位)
    Workplace = scrapy.Field()               # work location (工作地点)
    hands_background = scrapy.Field()        # required experience (工作经验)
    Education_requirements = scrapy.Field()  # required education (学历要求)
    Career_information = scrapy.Field()      # job description + requirements (职位信息)
    keyword = scrapy.Field()                 # posting keywords (关键字)
    day = scrapy.Field()                     # publication date (日期)
pipelines.py
# -*- coding: utf