Big Data Practical Training

Scraping the 51job (前程无忧) website
Workflow: locate the target data in the page → add anti-scraping countermeasures → start the database and save the data
Database: MongoDB (MySQL would also work)
1. Required fields: job title, salary, employer, work location, work experience, education requirement, job description (responsibilities), and qualifications (skill requirements).
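The post mentions anti-scraping countermeasures but never shows them. Below is a minimal settings.py sketch of the usual Scrapy knobs; the user-agent string, delay value, and pipeline name are illustrative assumptions, not taken from the original project.

# settings.py -- illustrative values, not from the original project
BOT_NAME = 'qianchengwuyou'

# Send a browser-like User-Agent instead of Scrapy's default
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0 Safari/537.36')

# Scrapy obeys robots.txt by default; tutorial setups usually turn this off
ROBOTSTXT_OBEY = False

# Throttle requests to lower the chance of being banned
DOWNLOAD_DELAY = 1

# Enable the MongoDB pipeline sketched in pipelines.py below (name is hypothetical)
ITEM_PIPELINES = {
    'qianchengwuyou.pipelines.MongoPipeline': 300,
}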
Spider file code

import scrapy
import re
from ..items import QianchengwuyouItem


class WuyouSpider(scrapy.Spider):
    name = 'wuyou'
    allowed_domains = ['search.51job.com','jobs.51job.com']
    start_urls = ['https://search.51job.com/list/000000,000000,0000,00,3,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=1&dibiaoid=0&line=&welfare=']
    def parse(self, response):
        # Collect every job posting on the current result page
        all_list = response.xpath('//*[@id="resultList"]//div[@class="el"]')
        # Follow the detail-page link of each posting
        for b in all_list:
            all_url = b.xpath('./p/span/a/@href').extract_first()
            if all_url:
                print(all_url)
                yield scrapy.Request(
                    all_url,
                    callback=self.parse_details
                )
        # Follow the "下一页" (next page) link until the listing runs out
        next_url = response.xpath("//*[@id='resultList']//div[@class='p_in']//li/a[text()='下一页']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                next_url,
                callback=self.parse
            )
        else:
            print("Finished!")
    def parse_details(self, response):
        # Parse the detail page of a single posting
        print("=" * 100)
        print(response.url)
        print("Scraping detail page...")
        item = QianchengwuyouItem()
        # Job title: drop all whitespace and normalize full-width commas
        title = response.xpath("//div[@class='cn']/h1/text()").extract_first() or ""
        item["Job_title"] = "".join(title.split()).replace(",", ",")
        # Salary
        item["Pay_level"] = response.xpath("//div[@class='cn']/strong/text()").extract_first()
        # Employer
        item["Recruitment_unit"] = response.xpath("//div[@class='cn']//a[1]/text()").extract_first()
        # Work location, experience, education etc. all live in //div[@class='cn']/p[2]
        item["Workplace"] = (response.xpath("//div[@class='cn']/p[2]/text()[1]").get() or "").replace('\xa0', '')
        # Work experience: keep the segments that contain "经验" (experience)
        item["hands_background"] = response.xpath("//div[@class='cn']/p[2]/text()").extract()
        item["hands_background"] = [i.strip() for i in item["hands_background"]]
        item["hands_background"] = [i for i in item["hands_background"] if "经验" in i]
        item["hands_background"] = " ".join(item["hands_background"]).replace("\xa0", "")
        if not item["hands_background"]:
            item["hands_background"] = "无"  # "无" = none
        # The second text node of p[2] is either the experience field or already the education field
        second = (response.xpath("//div[@class='cn']/p[2]/text()[2]").get() or "").replace('\xa0', '')
        if len(second) == 2:
            # Two characters (e.g. "本科"): this node is the education requirement itself
            item["Education_requirements"] = second
        elif len(second) >= 4:
            # Experience field present, so the education requirement is the third text node
            edu = (response.xpath("//div[@class='cn']/p[2]/text()[3]").get() or "").replace('\xa0', '')
            item["Education_requirements"] = edu if len(edu) == 2 else "无"
        else:
            item["Education_requirements"] = "无"
        item["Career_information"] = response.xpath("//div[@class='bmsg job_msg inbox']//text()").extract()
        item["Career_information"] = [i.strip() for i in item["Career_information"]]
        item["Career_information"] = [i for i in item["Career_information"] if len(i) > 0]
        item["Career_information"] = " ".join(item["Career_information"]).replace("\xa0","").replace(",",",")
        if (item["Pay_level"]) is None:
            item["Pay_level"] = "无"
        # Keywords
        item["keyword"] = response.xpath("//div[@class='mt10']//p//a/text()").extract()
        item["keyword"] = [i for i in item["keyword"] if len(i) > 0]
        item["keyword"] = " ".join(item["keyword"]).replace("\xa0", "").replace(",", ",")
        # Posting date: prefer the "MM-DD发布" pattern in the page source,
        # falling back to the title attribute of p[2]
        dates = re.findall(r"(\d+-\d+)发布", response.text)
        if dates:
            item["day"] = dates[0]
        else:
            item["day"] = (response.xpath("//div[@class='cn']/p[2]/@title").get() or "").replace("\xa0", "")
        yield item
        print("Detail page scraped successfully!")

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class QianchengwuyouItem(scrapy.Item):
    # Job title
    Job_title = scrapy.Field()
    # Salary
    Pay_level = scrapy.Field()
    # Employer
    Recruitment_unit = scrapy.Field()
    # Work location
    Workplace = scrapy.Field()
    # Work experience
    hands_background = scrapy.Field()
    # Education requirement
    Education_requirements = scrapy.Field()
    # Job information (description + qualifications + experience)
    Career_information = scrapy.Field()
    # Keywords
    keyword = scrapy.Field()
    # Posting date
    day = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-
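The rest of pipelines.py is cut off in the original post. Below is a minimal sketch of a pipeline that saves items to MongoDB, consistent with the database choice stated at the top; the connection URI and the database/collection names are assumptions, and the class name must match whatever is registered in ITEM_PIPELINES.

import pymongo


class MongoPipeline(object):
    def open_spider(self, spider):
        # Connect to a local mongod instance; adjust the URI as needed
        self.client = pymongo.MongoClient('mongodb://localhost:27017')
        # Hypothetical database and collection names
        self.collection = self.client['qianchengwuyou']['jobs']

    def process_item(self, item, spider):
        # One document per scraped job posting
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()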