Spider Depth Control in the Scrapy Framework
Source location
from scrapy.spidermiddlewares.depth import DepthMiddleware
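DepthMiddleware ships with Scrapy and is enabled by default in the spider middleware chain, so in practice you only tune its settings. A minimal sketch of the relevant settings.py entries (the values are illustrative, not Scrapy's defaults):

# settings.py -- illustrative values, not the defaults
DEPTH_LIMIT = 3             # drop requests more than 3 levels deep (0, the default, means unlimited)
DEPTH_PRIORITY = 1          # subtract depth * 1 from each request's priority
DEPTH_STATS_VERBOSE = True  # also record per-depth request counts in the crawl stats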
Source walkthrough
import logging

from scrapy.http import Request

logger = logging.getLogger(__name__)


class DepthMiddleware(object):

    def __init__(self, maxdepth, stats=None, verbose_stats=False, prio=1):
        self.maxdepth = maxdepth
        self.stats = stats
        self.verbose_stats = verbose_stats
        self.prio = prio

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        # Read the DEPTH_LIMIT cap from the settings
        maxdepth = settings.getint('DEPTH_LIMIT')
        verbose = settings.getbool('DEPTH_STATS_VERBOSE')
        # Priority weight, also read from the settings
        prio = settings.getint('DEPTH_PRIORITY')
        return cls(maxdepth, crawler.stats, verbose, prio)

    def process_spider_output(self, response, result, spider):
        # Decides whether a Request may be handed to the scheduler
        def _filter(request):
            if isinstance(request, Request):
                depth = response.meta['depth'] + 1
                request.meta['depth'] = depth
                if self.prio:
                    # Lower the priority by depth * self.prio
                    # (a new Request starts at priority 0)
                    request.priority -= depth * self.prio
                # If the current depth exceeds the maximum, return False
                if self.maxdepth and depth > self.maxdepth:
                    logger.debug(
                        "Ignoring link (depth > %(maxdepth)d): %(requrl)s ",
                        {'maxdepth': self.maxdepth, 'requrl': request.url},
                        extra={'spider': spider}
                    )
                    return False
                elif self.stats:
                    if self.verbose_stats:
                        self.stats.inc_value('request_depth_count/%s' % depth,
                                             spider=spider)
                    self.stats.max_value('request_depth_max', depth,
                                         spider=spider)
            return True

        # base case (depth=0): the depth always starts at 0
        if self.stats and 'depth' not in response.meta:
            # Explicitly set response.meta['depth'] = 0, which is the same as
            # response.request.meta['depth'] = 0
            response.meta['depth'] = 0
            if self.verbose_stats:
                self.stats.inc_value('request_depth_count/0', spider=spider)

        # result holds the Requests yielded from your own spider logic, e.g.
        #   yield Request(url='https://dig.chouti.com/login', callback=self.check_login)
        # We loop over every request in result and filter:
        #   _filter returns True:  hand it to the scheduler
        #   _filter returns False: drop it
        return (r for r in result or () if _filter(r))
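To watch the middleware at work, here is a hypothetical demo spider (the name, domain, and link-following logic are placeholders) that logs the depth stamped on each response:

import scrapy

class DepthDemoSpider(scrapy.Spider):
    name = 'depth_demo'
    start_urls = ['https://example.com']  # placeholder URL

    def parse(self, response):
        # 'depth' is written into meta by DepthMiddleware (0 for start_urls responses)
        self.logger.info('depth=%s url=%s',
                         response.meta.get('depth', 0), response.url)
        for href in response.css('a::attr(href)').getall():
            # every Request yielded here gets meta['depth'] = current depth + 1;
            # once DEPTH_LIMIT is exceeded, the middleware drops the request
            yield response.follow(href, callback=self.parse)

With DEPTH_LIMIT = 3, this spider stops following links three levels below the start URL, and the dropped links appear as "Ignoring link (depth > 3)" debug messages.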
How depth works
Depth
- The starting depth is 0.
- Each time a Request is yielded, its depth becomes the parent request's depth + 1.
- Setting: DEPTH_LIMIT caps the crawl depth.

Priority
- A request's priority is lowered as depth grows: priority -= depth * DEPTH_PRIORITY (see the sketch below).
- Setting: DEPTH_PRIORITY.
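One practical consequence: with a positive DEPTH_PRIORITY, every extra level of depth lowers a request's priority, so shallow pages are scheduled before deep ones. The Scrapy FAQ combines this with FIFO queues to approximate breadth-first crawling; a sketch of those settings:

# settings.py -- breadth-first order, per the Scrapy FAQ
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'

Leaving DEPTH_PRIORITY at its default of 0 keeps Scrapy's LIFO queues, which crawl in roughly depth-first order.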