基于scrapy爬取 boss内容(制定url模板+手动请求)

本文链接：https://blog.csdn.net/weixin_38507813/article/details/98339285

说明: 代码还有部分问题,目前不知道是什么原因(并非IP被封导致)不能爬取多页内容,但用此方法可以爬取其他网站的多页信息

爬取内容: python 专栏的岗位名称、薪资、公司名称、岗位描述

知识点:

UA伪装

from fake_useragent import UserAgent
USER_AGENT = UserAgent().random

xpath(…).extract 和 extract_first

extract（）将列表中的每一个列表元素表示的selector对象中的字符串进行提取，返回的还是一个列表

extract_first() 将列表中第0个列表元素对应的selector对象中的字符串进行提取，返回一个字符串

目录结构:
在这里插入图片描述
代码内容

settings.py

# -*- coding: utf-8 -*-
from fake_useragent import UserAgent

# Name of this Scrapy project/bot.
BOT_NAME = 'bossPro'

# Package that holds the spiders, and where newly generated spiders are placed.
SPIDER_MODULES = ['bossPro.spiders']
NEWSPIDER_MODULE = 'bossPro.spiders'


# UA spoofing: a random User-Agent string is picked once, when this settings
# module is loaded, and that single value is used as the USER_AGENT setting.
USER_AGENT = UserAgent().random

# Do not consult robots.txt before crawling.
ROBOTSTXT_OBEY = False
# Only log messages at ERROR level and above.
LOG_LEVEL = 'ERROR'

# The downloader middleware is left disabled; uncomment to enable it.
# DOWNLOADER_MIDDLEWARES = {
#    'bossPro.middlewares.BossproDownloaderMiddleware': 543,
# }

# Enable the item pipeline that receives scraped items (order value 300).
ITEM_PIPELINES = {
   'bossPro.pipelines.BossproPipeline': 300,
}

boss.py 内容

# -*- coding: utf-8 -*-
import scrapy
from bossPro.items import BossproItem


class BossSpider(scrapy.Spider):
    """Crawl Python job listings on zhipin.com together with their detail pages.

    For every listing row the job name, salary and company name are read from
    the listing page; the job description is read from the linked detail page.
    Follow-up listing pages are requested manually from a URL template.
    """

    name = 'boss'
    start_urls = ['https://www.zhipin.com/job_detail/?query=python&city=101010100&industry=&position=']
    # URL template for follow-up listing pages; %d is replaced by the page number.
    url = 'https://www.zhipin.com/c101010100/?query=python&page=%d'
    # Number of the next listing page to request.
    page_num = 2
    # Last listing page to request (inclusive).
    max_page = 3

    def parse(self, response):
        """Parse one listing page.

        Yields one detail-page request per listing row (carrying the partially
        filled item in ``meta``) and, while ``page_num <= max_page``, one
        request for the next listing page.
        """
        for li in response.xpath('//*[@id="main"]/div/div[3]/ul/li'):
            href = li.xpath('./div/div[1]/h3/a/@href').extract_first()
            if not href:
                # extract_first() returns None when the row has no detail link.
                # Skip the row: an exception raised here would abort this
                # generator and the pagination request below would never be
                # yielded.
                continue

            # Fill in what the listing page provides; the detail callback adds
            # the job description before the item goes to the pipeline.
            item = BossproItem()
            item['job_name'] = li.xpath('./div/div[1]/h3/a/div[1]/text()').extract_first()
            item['salary'] = li.xpath('./div/div[1]/h3/a/span/text()').extract_first()
            item['company_name'] = li.xpath(
                './/div[@class="info-company"]/div/h3/a/text()'
            ).extract_first()

            # urljoin resolves the href against the page URL, so a
            # root-relative href does not end up with a doubled slash.
            yield scrapy.Request(
                url=response.urljoin(href),
                callback=self.parse_detail,
                # meta hands the item over to the callback.
                meta={'item': item},
            )

        # Manual pagination: build the next listing URL from the template.
        if self.page_num <= self.max_page:
            new_url = self.url % self.page_num
            self.page_num += 1
            yield scrapy.Request(url=new_url, callback=self.parse)

    def parse_detail(self, response):
        """Parse a job detail page and yield the completed item.

        The item created in ``parse`` is read back from ``response.meta`` and
        its ``job_desc`` field is set to the concatenated description text.
        """
        desc_parts = response.xpath(
            '//*[@id="main"]/div[3]/div/div[2]/div[2]/div[1]/div//text()'
        ).extract()

        item = response.meta['item']
        item['job_desc'] = ''.join(desc_parts)

        # Hand the finished item to the item pipeline.
        yield item

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class BossproItem(scrapy.Item):
    """Container for the data scraped for one job posting."""

    # Job title text taken from the listing page.
    job_name = scrapy.Field()
    # Salary text taken from the listing page.
    salary = scrapy.Field()
    # Name of the hiring company, taken from the listing page.
    company_name = scrapy.Field()
    # Job description text, filled in from the job's detail page.
    job_desc = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class BossproPipeline(object):
    """Write every scraped item as one ':'-separated record to ``boss.txt``."""

    # Output file handle; set in open_spider and released in close_spider.
    fp = None

    def open_spider(self, spider):
        """Open (and truncate) the output file once, when the spider starts."""
        self.fp = open('boss.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Append one record for ``item`` to the output file and return the item.

        The record is ``job_name:salary:company_name:job_desc`` followed by a
        newline. Fields that are missing or ``None`` (``extract_first()``
        returns ``None`` when an XPath matches nothing) are written as empty
        strings instead of raising ``TypeError``.
        """
        values = (
            item.get('job_name'),
            item.get('salary'),
            item.get('company_name'),
            item.get('job_desc'),
        )
        self.fp.write(':'.join(value or '' for value in values) + '\n')

        # Return the item so that later pipelines still receive it.
        return item

    def close_spider(self, spider):
        """Close the output file if it was opened."""
        if self.fp is not None:
            self.fp.close()
            self.fp = None

middlewares.py
此处只记录需要添加代码内容

# 正常请求
    def process_request(self, request, spider):
        """Handle an outgoing request without modifying it.

        Prints a marker line and returns ``None``; the request itself is left
        untouched.
        """
        print('正常请求')

        # UA spoofing needs no work here: it is already configured in the
        # settings via fake_useragent (USER_AGENT = UserAgent().random).
        #
        # Proxy IP hint (disabled): proxy addresses can be scraped from a
        # proxy-list site and picked at random per request, e.g.
        # request.meta['proxy'] = random.choice(proxies)
        return