Scraping 51job (前程无忧) with Scrapy: making the job hunt easy

The project is a standard Scrapy layout with four files: the spider, the item definition, a MongoDB pipeline, and the project settings.

# qcwy/spiders/ -- the spider (module path per SPIDER_MODULES in settings.py)
# -*- coding: utf-8 -*-
import scrapy
from qcwy.items import QcwyItem
from urllib import parse
import re


class A51jobSpider(scrapy.Spider):
    name = '51job'
    allowed_domains = ['51job.com']
    keyword = "python开发工程师"  # swap in any search keyword here
    kw = parse.quote(parse.quote(keyword))  # the search URL carries the keyword percent-encoded twice
    base_url = "https://search.51job.com/list/020000,000000,0000,00,9,99,{0},2,{1}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
    offset = 1
    start_urls = [base_url.format(kw, offset)]

    def parse(self, response):
        if response.body:
            string = response.xpath('//span[@class="td"]').extract()[0]
            # the first number in the pager text is taken as the page count
            match = re.match(r".*?(\d+).*?", string)
            if match is None:
                return
            count = match.groups()[0]
            for data in response.xpath('//div[@class="el"]'):
                item = QcwyItem()
                item['position'] = data.xpath('./p/span/a/text()').extract()
                if len(item['position']) != 0:
                    item['position'] = str(item['position'][0]).replace('\r\n', "").strip()
                    item['position_href'] = data.xpath('./p/span/a/@href').extract()[0]
                    item['company'] = data.xpath('./span/a/text()').extract()[0]
                    item['company_href'] = data.xpath('./span/a/@href').extract()[0]
                    item['workplace'] = data.xpath('./span[@class="t3"]/text()').extract()[0]
                    item['pay'] = data.xpath('./span[@class="t4"]/text()').extract()
                    if len(item['pay']) != 0:
                        item['pay'] = item['pay'][0]
                    else:
                        item['pay'] = ""
                    item['release_time'] = data.xpath('./span[@class="t5"]/text()').extract()[0]
                    yield scrapy.Request(url=item['position_href'], callback=self.parse_info,
                                         meta={"iteminfo": item},
                                         headers={
                                             # the listing page we came from is the natural Referer
                                             'referer': response.url
                                         })
                    # yield item
            # only one listing request is in flight at a time, so a shared
            # page counter on the spider instance is safe here
            self.offset += 1
            if self.offset > int(count):
                return
            next_url = self.base_url.format(self.kw, self.offset)
            yield scrapy.Request(next_url, callback=self.parse,
                                 headers={'referer': response.url})

    def parse_info(self, response):
        """
        Parse the job detail page and fill in the remaining item fields.
        :param response:
        :return:
        """
        item = response.meta["iteminfo"]

        def clean_join(parts):
            # extract() yields strings (never None), some of them
            # whitespace-only; collapse runs of whitespace and skip
            # the empty pieces
            return " ".join(re.sub(r"\s+", " ", x).strip() for x in parts if x.strip())

        position_ask = response.xpath('//div[@class="cn"]/p[@class="msg ltype"]/text()').extract()
        item['position_ask'] = clean_join(position_ask)
        position_welfare = response.xpath('//div[@class="cn"]//div[@class="t1"]/span/text()').extract()
        item['position_welfare'] = clean_join(position_welfare)
        position_info = \
            response.xpath('//div[@class="tCompany_main"]//div[@class="bmsg job_msg inbox"]//text()').extract()
        item['position_info'] = clean_join(position_info)
        position_el = response.xpath(
            '//div[@class="tCompany_main"]/div[@class="tBorderTop_box"]/div[@class="bmsg inbox"]//text()').extract()
        item['position_el'] = clean_join(position_el)
        yield item
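
A detail worth calling out: the spider runs the keyword through parse.quote twice, so the URL it builds carries a double-encoded keyword. A standalone sketch of what actually lands in base_url (plain Python, keyword value taken from the spider above):

from urllib import parse

keyword = "python开发工程师"
once = parse.quote(keyword)
# 'python%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88'
twice = parse.quote(once)
# the second pass encodes the percent signs themselves:
# 'python%25E5%25BC%2580%25E5%258F%2591%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588'
print(twice)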

# qcwy/items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class QcwyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # job title
    position = scrapy.Field()
    # job posting URL
    position_href = scrapy.Field()
    # job requirements
    position_ask = scrapy.Field()
    # benefits
    position_welfare = scrapy.Field()
    # job description
    position_info = scrapy.Field()
    # other job details
    position_el = scrapy.Field()
    # company name
    company = scrapy.Field()
    # company URL
    company_href = scrapy.Field()
    # work location
    workplace = scrapy.Field()
    # salary
    pay = scrapy.Field()
    # posting date
    release_time = scrapy.Field()
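
Nothing here beyond plain scrapy.Field() declarations, but note that a QcwyItem supports dict-style access; that is what lets the pipeline below call dict(item) before inserting into MongoDB. A quick sketch with made-up sample values:

from qcwy.items import QcwyItem

item = QcwyItem()
item['position'] = 'python开发工程师'  # hypothetical sample values
item['pay'] = '1.5-2万/月'
print(dict(item))
# {'position': 'python开发工程师', 'pay': '1.5-2万/月'}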

# qcwy/pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient


class QcwyPipeline(object):

    def open_spider(self, spider):
        # open one MongoDB connection for the whole crawl,
        # not a fresh one per item
        self.client = MongoClient(host="127.0.0.1", port=27017)
        db = self.client['qcwy']
        self.col = db['python']  # swap in any collection name here

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.col.insert_one(dict(item))
        return item
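
The connection details above are hard-coded. If you would rather keep them in settings.py, a pipeline can pull them from the crawler settings via from_crawler; a minimal sketch, where MONGO_HOST, MONGO_PORT, MONGO_DB and MONGO_COLLECTION are setting names invented for this example:

from pymongo import MongoClient


class MongoSettingsPipeline(object):
    """Variant of QcwyPipeline that reads its connection details from settings."""

    def __init__(self, host, port, db, collection):
        self.host = host
        self.port = port
        self.db_name = db
        self.col_name = collection

    @classmethod
    def from_crawler(cls, crawler):
        # MONGO_* are assumed setting names; the defaults mirror the
        # hard-coded values in QcwyPipeline
        s = crawler.settings
        return cls(
            host=s.get('MONGO_HOST', '127.0.0.1'),
            port=s.getint('MONGO_PORT', 27017),
            db=s.get('MONGO_DB', 'qcwy'),
            collection=s.get('MONGO_COLLECTION', 'python'),
        )

    def open_spider(self, spider):
        self.client = MongoClient(host=self.host, port=self.port)
        self.col = self.client[self.db_name][self.col_name]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.col.insert_one(dict(item))
        return item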

# qcwy/settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for qcwy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'qcwy'

SPIDER_MODULES = ['qcwy.spiders']
NEWSPIDER_MODULE = 'qcwy.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'qcwy.middlewares.QcwySpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'qcwy.middlewares.QcwyDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'qcwy.pipelines.QcwyPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
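
With all four files in place, the crawl starts with "scrapy crawl 51job" from the project root. It can also be driven from a plain Python script; a minimal sketch, assuming a run.py placed next to scrapy.cfg:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# get_project_settings() picks up qcwy/settings.py via scrapy.cfg
process = CrawlerProcess(get_project_settings())
process.crawl('51job')  # the spider's name attribute
process.start()         # blocks until the crawl finishes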
