爬虫基础9(框架Scrapy中爬虫深度控制)

最新推荐文章于 2024-08-18 23:40:08 发布

weixin_30357231

最新推荐文章于 2024-08-18 23:40:08 发布

阅读量205

点赞数

文章标签：爬虫 python

原文链接：http://www.cnblogs.com/L5251/articles/9265297.html

版权

框架Scrapy中爬虫深度控制

源码位置

from scrapy.spidermiddlewares.depth import DepthMiddleware

源码解析

import logging

from scrapy.http import Request

# Module-level logger named after this module; DepthMiddleware uses it to
# emit a debug message for every request dropped by the depth limit.
logger = logging.getLogger(__name__)


class DepthMiddleware(object):
    """Spider middleware that tracks and limits the depth of requests.

    Every ``Request`` yielded while handling a response gets
    ``meta['depth']`` set to the response's depth plus one. Requests deeper
    than ``maxdepth`` are dropped, and the request priority can be lowered
    in proportion to the depth.
    """

    def __init__(self, maxdepth, stats=None, verbose_stats=False, prio=1):
        """Store the depth-control configuration.

        Args:
            maxdepth: Maximum allowed depth; a falsy value (such as 0)
                disables the limit.
            stats: Optional stats collector providing ``inc_value`` and
                ``max_value``; when falsy, no statistics are recorded.
            verbose_stats: When true, also count requests per depth level.
            prio: Factor multiplied by the depth and subtracted from the
                request priority; a falsy value leaves priorities untouched.
        """
        self.maxdepth = maxdepth
        self.stats = stats
        self.verbose_stats = verbose_stats
        self.prio = prio

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler's settings and stats.

        Reads ``DEPTH_LIMIT``, ``DEPTH_STATS_VERBOSE`` and ``DEPTH_PRIORITY``
        from ``crawler.settings`` and passes ``crawler.stats`` along.
        """
        settings = crawler.settings
        # Maximum depth comes from the DEPTH_LIMIT setting.
        maxdepth = settings.getint('DEPTH_LIMIT')
        verbose = settings.getbool('DEPTH_STATS_VERBOSE')
        # Priority factor comes from the DEPTH_PRIORITY setting.
        prio = settings.getint('DEPTH_PRIORITY')
        return cls(maxdepth, crawler.stats, verbose, prio)

    def process_spider_output(self, response, result, spider):
        """Filter the spider's output, annotating requests with their depth.

        Args:
            response: The response being processed; its ``meta['depth']`` is
                initialised to 0 when missing.
            result: Iterable of objects yielded by the spider callback
                (``None`` is treated as empty).
            spider: The running spider, forwarded to logging and stats.

        Returns:
            A lazy generator yielding every item of ``result`` except the
            requests whose depth exceeds ``maxdepth``.
        """
        def _filter(request):
            # Only Request objects carry a depth; anything else passes through.
            if not isinstance(request, Request):
                return True

            depth = response.meta['depth'] + 1
            request.meta['depth'] = depth
            if self.prio:
                # Lower the priority by depth * prio (a request's priority
                # normally starts at 0 unless the spider set it explicitly).
                request.priority -= depth * self.prio

            # Drop the request when it is deeper than the configured limit.
            if self.maxdepth and depth > self.maxdepth:
                logger.debug(
                    "Ignoring link (depth > %(maxdepth)d): %(requrl)s ",
                    {'maxdepth': self.maxdepth, 'requrl': request.url},
                    extra={'spider': spider}
                )
                return False

            if self.stats:
                if self.verbose_stats:
                    self.stats.inc_value('request_depth_count/%s' % depth,
                                         spider=spider)
                self.stats.max_value('request_depth_max', depth,
                                     spider=spider)
            return True

        # Base case: a response without a recorded depth is at depth 0.
        # This must not depend on a stats collector being configured,
        # otherwise _filter fails with KeyError when reading
        # response.meta['depth'] for a middleware created with stats=None.
        if 'depth' not in response.meta:
            response.meta['depth'] = 0
            if self.stats and self.verbose_stats:
                self.stats.inc_value('request_depth_count/0', spider=spider)

        # `result` holds whatever the spider callback yielded. Each item is
        # run through _filter: True keeps it (requests go on to the
        # scheduler), False discards it.
        return (r for r in result or () if _filter(r))

对深度的解释

深度
    最初请求对应的响应深度为 0（response.meta 中没有 depth 时会被赋值为 0）
    每次 yield 新的 Request 时，新请求的 depth = 产生它的响应的 depth + 1
    配置：DEPTH_LIMIT 控制最大深度，depth 大于该值的请求会被丢弃（为 0 时不限制）
优先级
    请求的优先级 -= 深度 * DEPTH_PRIORITY
    配置：DEPTH_PRIORITY（为 0 时不调整优先级；为正数时深度越大，优先级越低）

转载于:https://www.cnblogs.com/L5251/articles/9265297.html