Scraping the 51job (Qiancheng Wuyou) Website
Basic crawler workflow
Find where the data lives on the page ---- write anti-blocking measures ---- start the database and save the data
This tutorial stores results in MongoDB; MySQL works much the same way, so look it up yourself if you need it (a minimal pipeline sketch is at the end of this section).
PS: this was tested on 2020/7/4. If the site later updates its anti-crawling measures and the code stops working, this tutorial takes no blame!
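The "anti-blocking" step in the workflow above mostly comes down to a few lines in the project's settings.py. Below is a minimal sketch of the kind of settings that work; the concrete values (user agent string, delay, concurrency) are my assumptions, not necessarily what this project used:

# settings.py -- minimal anti-blocking sketch; the concrete values are assumptions
BOT_NAME = 'qianchengwuyou'

# Send a browser-like User-Agent instead of Scrapy's default one
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0 Safari/537.36')

# Disable the robots.txt check so Scrapy will actually fetch the listing pages
ROBOTSTXT_OBEY = False

# Throttle requests so the site is less likely to ban the crawler
DOWNLOAD_DELAY = 1
CONCURRENT_REQUESTS_PER_DOMAIN = 4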
1. Fields to collect: job title, salary, hiring company, work location, work experience, education requirement, job description (responsibilities), and job requirements (skills).
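The spider code below imports QianchengwuyouItem from items.py. That file isn't reproduced in this post, but judging from the fields the spider fills in, it has to look roughly like this:

# items.py -- reconstructed from the fields the spider assigns below
import scrapy


class QianchengwuyouItem(scrapy.Item):
    Job_title = scrapy.Field()               # job title
    Pay_level = scrapy.Field()               # salary
    Recruitment_unit = scrapy.Field()        # hiring company
    Workplace = scrapy.Field()               # work location
    hands_background = scrapy.Field()        # work experience
    Education_requirements = scrapy.Field()  # education requirement
    Career_information = scrapy.Field()      # job description / requirements
    keyword = scrapy.Field()                 # keywords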
Spider file code
# -*- coding: utf-8 -*-
import scrapy

from ..items import QianchengwuyouItem


class WuyouSpider(scrapy.Spider):
    name = 'wuyou'
    allowed_domains = ['jobs.51job.com']
    start_urls = ['http://jobs.51job.com/']

    def parse_details(self, response):
        # Parse one job detail page
        print("=" * 100)
        print(response.url)
        item = QianchengwuyouItem()
        # Job title
        item["Job_title"] = response.xpath("//div[@class='cn']/h1/text()").extract_first()
        # Salary
        item["Pay_level"] = response.xpath("//div[@class='cn']/strong/text()").extract_first()
        # Hiring company
        item["Recruitment_unit"] = response.xpath("//div[@class='cn']//a[1]/text()").extract_first()
        # Work location, experience and education are all packed into //div[@class='cn']/p[2];
        # default='' guards against a missing text node, and \xa0 separators are stripped
        item["Workplace"] = response.xpath("//div[@class='cn']/p[2]/text()[1]").get(default='').replace('\xa0', '')
        # The second text node is either the experience or, when experience is absent, the education
        info = response.xpath("//div[@class='cn']/p[2]/text()[2]").get(default='').replace('\xa0', '')
        # Heuristic: experience strings are at least 4 characters long (e.g. 3-4年经验),
        # while education strings are exactly 2 (e.g. 本科, 大专)
        if len(info) >= 4:
            item["hands_background"] = info
            item["Education_requirements"] = response.xpath("//div[@class='cn']/p[2]/text()[3]").get(default='').replace('\xa0', '')
            if len(item["Education_requirements"]) != 2:
                item["Education_requirements"] = None
        else:
            item["hands_background"] = None
            item["Education_requirements"] = info
            if len(item["Education_requirements"]) != 2:
                item["Education_requirements"] = None
        # Job information (responsibilities + requirements + experience + education)
        item["Career_information"] = response.xpath("//div[@class='bmsg job_msg inbox']/p/text()").extract()
        item["Career_information"] = [i.strip() for i in item["Career_information"]]
        item["Career_information"] = [i for i in item["Career_information"] if len(i) > 0]
        # Join into one string, dropping \xa0 and normalising fullwidth commas
        item["Career_information"] = " ".join(item["Career_information"]).replace("\xa0", "").replace(",", ",")
        if item["Pay_level"] is None:
            item["Pay_level"] = "无"
        # Keywords
        item["keyword"] = response.xpath("//div[@class='mt10']//p//a/text()").extract()
        yield item
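    # Tip (not from the original post): XPaths like the ones above are easiest to
    # debug interactively before hard-coding them, e.g. with any detail-page URL:
    #   scrapy shell "https://jobs.51job.com/<some-job-page>.html"
    #   >>> response.xpath("//div[@class='cn']/h1/text()").extract_first()
    #   >>> response.xpath("//div[@class='cn']/p[2]/text()").getall()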
    def industry_perse(self, response):
        # Grab the div of every job listed on the current industry page
        all_list = response.xpath("//div[@class='detlist gbox']//div")
        # Count how many URLs this page yields
        url_num = 0
        # Walk every occupation link in the big-data industry listing
        # and hand each detail page to parse_details
        for a in all_list:
            occupation_url = a.xpath("./p/span/a/@href").extract_first()
            yield scrapy.Request(
                occupation_url,
                callback=self.parse_details
            )
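As mentioned at the top, the scraped items go into MongoDB. The pipeline file isn't shown in this section, but a minimal pipelines.py sketch using pymongo could look like this; the host, database, and collection names are assumptions:

# pipelines.py -- minimal MongoDB sketch; host/database/collection names are assumptions
import pymongo


class QianchengwuyouPipeline:
    def open_spider(self, spider):
        # Connect once when the spider starts
        self.client = pymongo.MongoClient("mongodb://localhost:27017")
        self.collection = self.client["qianchengwuyou"]["jobs"]

    def process_item(self, item, spider):
        # dict(item) turns the scrapy.Item into a plain document MongoDB can store
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()

Remember to register the pipeline in settings.py, e.g. ITEM_PIPELINES = {'qianchengwuyou.pipelines.QianchengwuyouPipeline': 300} (the module path is an assumption based on the project name).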