爬虫基础12(框架Scrapy中间件)

Scrapy中间件

  爬虫中间件

class SpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        return s
        
    def process_spider_input(self,response, spider):
        """
        下载完成,执行,然后交给parse处理
        :param response: 
        :param spider: 
        :return: 
        """
        pass

    def process_spider_output(self,response, result, spider):
        """
        spider处理完成,返回时调用
        :param response:
        :param result:
        :param spider:
        :return: 必须返回包含 Request 或 Item 对象的可迭代对象(iterable)
        """
        return result

    def process_spider_exception(self,response, exception, spider):
        """
        异常调用
        :param response:
        :param exception:
        :param spider:
        :return: None,继续交给后续中间件处理异常;含 Response 或 Item 的可迭代对象(iterable),交给调度器或pipeline
        """
        return None

    # 只在爬虫启动时,执行一次。
    def process_start_requests(self,start_requests, spider):
        """
        爬虫启动时调用
        :param start_requests:
        :param spider:
        :return: 包含 Request 对象的可迭代对象
        """
        return start_requests

配置方法:

SPIDER_MIDDLEWARES = {
        'xdb.sd.SpiderMiddleware': 666,
        'xdb.sd.Sd2': 667,
    }

应用:

- 深度
- 优先级

下载中间介件

class DownMiddleware1(object):
    
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        return s
    
    def process_request(self, request, spider):
        """
        请求需要被下载时,经过所有下载器中间件的process_request调用
        :param request: 
        :param spider: 
        :return:  
            None,继续后续中间件去下载;
            Response对象,停止process_request的执行,开始执行process_response
            Request对象,停止中间件的执行,将Request重新调度器
            raise IgnoreRequest异常,停止process_request的执行,开始执行process_exception
        """
        # 1. 返回Response
        # import requests
        # result = requests.get(request.url)
        # return HtmlResponse(url=request.url, status=200, headers=None, body=result.content)
        # 2. 返回Request
        # return Request('https://dig.chouti.com/r/tec/hot/1')

        # 3. 抛出异常
        # from scrapy.exceptions import IgnoreRequest
        # raise IgnoreRequest

        # 4. 对请求进行加工(*)
        # request.headers['user-agent'] = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
        pass



    def process_response(self, request, response, spider):
        """
        spider处理完成,返回时调用
        :param response:
        :param result:
        :param spider:
        :return: 
            Response 对象:转交给其他中间件process_response
            Request 对象:停止中间件,request会被重新调度下载
            raise IgnoreRequest 异常:调用Request.errback
        """
        print('response1')
        return response

    def process_exception(self, request, exception, spider):
        """
        当下载处理器(download handler)或 process_request() (下载中间件)抛出异常
        :param response:
        :param exception:
        :param spider:
        :return: 
            None:继续交给后续中间件处理异常;
            Response对象:停止后续process_exception方法
            Request对象:停止中间件,request将会被重新调用下载
        """
        return None

配置方法:

DOWNLOADER_MIDDLEWARES = {
   #'xdb.middlewares.XdbDownloaderMiddleware': 543,
    # 'xdb.proxy.XdbProxyMiddleware':751,
    'xdb.md.DownMiddleware1':666,
    'xdb.md.Md2':667,
}

应用:

- user-agent 
- 代理 

 

转载于:https://www.cnblogs.com/L5251/articles/9276341.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值