一、Redis queues and stacks
-
Approach 1
import redis

class LifoQueue(object):
    """Per-spider LIFO queue (a stack)."""

    def __init__(self):
        self.server = redis.Redis(host='140.143.227.206', port=8888, password='beta')

    def push(self, request):
        """Push a request onto the head of the list."""
        self.server.lpush("USERS", request)

    def pop(self, timeout=0):
        """Pop a request from the head of the list (last in, first out)."""
        data = self.server.lpop('USERS')
        return data  # e.g. after pushing 11, 22, 33 the list is [33, 22, 11]
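The class above is a stack: lpush and lpop both work at the head of the list. Since the section title also covers queues, here is a FIFO variant under the same connection settings; this is a sketch added for contrast, not part of the original notes:

import redis

class FifoQueue(object):
    """Per-spider FIFO queue: push on the left, pop on the right."""

    def __init__(self):
        # Same demo Redis server as above
        self.server = redis.Redis(host='140.143.227.206', port=8888, password='beta')

    def push(self, request):
        self.server.lpush('USERS', request)

    def pop(self, timeout=0):
        # rpop takes from the opposite end of lpush, giving first-in-first-out order
        return self.server.rpop('USERS')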
-
Approach 2
import redis

class PriorityQueue(object):
    """Per-spider priority queue abstraction using redis' sorted set."""

    def __init__(self):
        self.server = redis.Redis(host='140.143.227.206', port=8888, password='beta')

    def push(self, request, score):
        """Push a request with the given score."""
        # data = self._encode_request(request)
        # score = -request.priority
        # We don't use the zadd method as the order of arguments changes depending on
        # whether the class is Redis or StrictRedis, and the option of using
        # kwargs only accepts strings, not bytes.
        self.server.execute_command('ZADD', 'xxxxxx', score, request)

    def pop(self, timeout=0):
        """Pop a request; timeout is not supported in this queue class."""
        # atomic range/remove using multi/exec
        pipe = self.server.pipeline()
        pipe.multi()
        pipe.zrange('xxxxxx', 0, 0).zremrangebyrank('xxxxxx', 0, 0)
        results, count = pipe.execute()
        if results:
            return results[0]


q = PriorityQueue()
q.push('alex', 99)
q.push('oldboy', 56)
q.push('eric', 77)

# zrange returns the lowest score first, so this prints b'oldboy',
# then b'eric', then b'alex'.
v1 = q.pop()
print(v1)
v2 = q.pop()
print(v2)
v3 = q.pop()
print(v3)
二、Deduplication rules
-
Approach 1 (Redis set)
from scrapy.dupefilter import BaseDupeFilter
from scrapy.utils.request import request_fingerprint
import redis


class DupFilter(BaseDupeFilter):
    def __init__(self):
        self.conn = redis.Redis(host='140.143.227.206', port=8888, password='beta')

    def request_seen(self, request):
        """
        Check whether the current request has already been visited.
        :param request:
        :return: True if already visited; False if not yet visited.
        """
        fid = request_fingerprint(request)
        # sadd returns 1 if the fingerprint was newly added, i.e. not seen before
        result = self.conn.sadd('visited_urls', fid)
        if result == 1:
            return False
        return True
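To make scrapy use this custom filter, point DUPEFILTER_CLASS at it in settings.py. The module path below is an assumption; use wherever DupFilter actually lives in your project:

# settings.py (hypothetical module path)
DUPEFILTER_CLASS = 'myproject.dupefilters.DupFilter'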
-
Approach 2 (scrapy_redis dedup)
from scrapy_redis.dupefilter import RFPDupeFilter
from scrapy_redis.connection import get_redis_from_settings
from scrapy_redis import defaults


class RedisDupeFilter(RFPDupeFilter):
    @classmethod
    def from_settings(cls, settings):
        """Returns an instance from given settings.

        This uses by default the key ``dupefilter:<timestamp>``. When using the
        ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
        it needs to pass the spider name in the key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RFPDupeFilter
            A RFPDupeFilter instance.
        """
        server = get_redis_from_settings(settings)
        # XXX: This creates one-time key. needed to support to use this
        # class as standalone dupefilter with scrapy's default scheduler
        # if scrapy passes spider on open() method this wouldn't be needed
        # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
        key = defaults.DUPEFILTER_KEY % {'timestamp': 'xiaodongbei'}
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(server, key=key, debug=debug)
REDIS_HOST = '140.143.227.206'                   # host
REDIS_PORT = 8888                                # port
REDIS_PARAMS = {'password': 'beta'}              # Redis connection parameters; default: {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
REDIS_ENCODING = "utf-8"                         # redis encoding; default: 'utf-8'
# REDIS_URL = 'redis://user:pass@hostname:9001'  # connection URL (takes precedence over the settings above)

################ dedup ######################
DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
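If you want the fixed-timestamp key behavior of the RedisDupeFilter subclass above, point DUPEFILTER_CLASS at that subclass instead of the stock filter; the module path is again an assumption:

# settings.py (hypothetical module path)
DUPEFILTER_CLASS = 'myproject.dupefilters.RedisDupeFilter'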
三、The scheduler
-
scrapy crawl chouti --nolog
-
Find the SCHEDULER = "scrapy_redis.scheduler.Scheduler" setting and instantiate the scheduler object:
- Call Scheduler.from_crawler
- Call Scheduler.from_settings
- Read the settings:
    SCHEDULER_PERSIST            # keep the scheduler queue and dedup records on close? True = keep, False = flush
    SCHEDULER_FLUSH_ON_START     # flush the scheduler queue and dedup records on start? True = flush, False = keep
    SCHEDULER_IDLE_BEFORE_CLOSE  # when fetching from the scheduler, how long at most to wait if it is empty (give up if there is still no data)
- Read the settings:
    SCHEDULER_QUEUE_KEY          # %(spider)s:requests
    SCHEDULER_QUEUE_CLASS        # scrapy_redis.queue.FifoQueue
    SCHEDULER_DUPEFILTER_KEY     # '%(spider)s:dupefilter'
    DUPEFILTER_CLASS             # 'scrapy_redis.dupefilter.RFPDupeFilter'
    SCHEDULER_SERIALIZER         # "scrapy_redis.picklecompat"
- Read the settings:
    REDIS_HOST = '140.143.227.206'       # host
    REDIS_PORT = 8888                    # port
    REDIS_PARAMS = {'password': 'beta'}  # Redis connection parameters; default: {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
    REDIS_ENCODING = "utf-8"
- Instantiate the Scheduler object (a condensed sketch follows this list)
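The sketch mentioned above: a condensed paraphrase of how Scheduler.from_settings pulls these settings together (based on scrapy_redis.scheduler; not the verbatim source, and build_scheduler is a name invented here):

from scrapy_redis import connection

def build_scheduler(scheduler_cls, settings):
    # Required settings with simple defaults
    kwargs = {
        'persist': settings.getbool('SCHEDULER_PERSIST'),
        'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'),
        'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'),
    }
    # Optional settings fall back to the defaults in scrapy_redis.defaults
    for name, setting in [('queue_key', 'SCHEDULER_QUEUE_KEY'),
                          ('queue_cls', 'SCHEDULER_QUEUE_CLASS'),
                          ('dupefilter_key', 'SCHEDULER_DUPEFILTER_KEY'),
                          ('serializer', 'SCHEDULER_SERIALIZER')]:
        val = settings.get(setting)
        if val:
            kwargs[name] = val
    # REDIS_HOST / REDIS_PORT / REDIS_PARAMS / REDIS_URL are read here
    server = connection.from_settings(settings)
    return scheduler_cls(server=server, **kwargs)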
-
The spider starts executing the start URLs,
calling scheduler.enqueue_request():

def enqueue_request(self, request):
    # Does the request need to be filtered?
    # Is it already in the dedup records? (Has it been visited? If not, add it to the dedup records.)
    if not request.dont_filter and self.df.request_seen(request):
        self.df.log(request, self.spider)
        # Already visited; do not visit it again
        return False
    if self.stats:
        self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider)
    # print('Not visited yet; add it to the scheduler', request)
    self.queue.push(request)
    return True
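Whether a request goes through this check is controlled from the spider side: dont_filter=True bypasses request_seen() entirely. A minimal sketch (spider name taken from the crawl command above; the URL is illustrative):

import scrapy

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    start_urls = ['https://dig.chouti.com/']

    def parse(self, response):
        # dont_filter=True skips the dedup check in enqueue_request,
        # so this request is queued even if its fingerprint was seen before.
        yield scrapy.Request(url=response.url, dont_filter=True, callback=self.parse)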
-
The downloader fetches a task from the scheduler and downloads it,
- calling scheduler.next_request():

def next_request(self):
    block_pop_timeout = self.idle_before_close
    request = self.queue.pop(block_pop_timeout)
    if request and self.stats:
        self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider)
    return request
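For the list-backed queues, that timeout is what makes the pop blocking. A paraphrased sketch of the idea behind scrapy_redis.queue.FifoQueue.pop (blocking_pop is a name invented here; not the verbatim source):

import redis

def blocking_pop(server, key, timeout=0):
    # With a positive timeout, BRPOP blocks up to `timeout` seconds
    # waiting for data; otherwise RPOP returns immediately (possibly None).
    if timeout > 0:
        data = server.brpop(key, timeout)
        if isinstance(data, tuple):
            data = data[1]
        return data
    return server.rpop(key)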
四、Depth-first and breadth-first
-
What is depth-first and what is breadth-first? Depth-first keeps following newly discovered links before returning to their siblings; breadth-first crawls every page at the current depth before moving one level deeper.
-
How do you implement depth-first and breadth-first crawling in scrapy?
First in, first out: breadth-first
Last in, first out: depth-first
Priority queue (see the sketch after this list):
    DEPTH_PRIORITY = 1   # breadth-first
    DEPTH_PRIORITY = -1  # depth-first
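The sketch mentioned above: with the priority queue, DEPTH_PRIORITY works through Scrapy's DepthMiddleware, which lowers or raises a request's priority as the crawl gets deeper. A simplified paraphrase (adjust_priority is a name invented here; see scrapy.spidermiddlewares.depth for the real code):

def adjust_priority(request_priority, depth, depth_priority):
    # DepthMiddleware effectively does: request.priority -= depth * DEPTH_PRIORITY.
    # With DEPTH_PRIORITY = 1, deeper requests get lower priority (breadth-first);
    # with DEPTH_PRIORITY = -1, deeper requests get higher priority (depth-first).
    return request_priority - depth * depth_priority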
-
What is the relationship between the scheduler, the queue, and the dupefilter in scrapy?
The scheduler decides which request gets added or handed out next. The queue stores the requests. The dupefilter keeps the record of what has been visited.
-
Configuration
Redis connection settings:
    REDIS_HOST = '140.143.227.206'       # host
    REDIS_PORT = 8888                    # port
    REDIS_PARAMS = {'password': 'beta'}  # Redis connection parameters; default: {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
    REDIS_ENCODING = "utf-8"             # redis encoding; default: 'utf-8'

Dedup settings:
    DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
    DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'

Scheduler settings:
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"

    DEPTH_PRIORITY = 1    # breadth-first
    # DEPTH_PRIORITY = -1 # depth-first
    SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'  # default; alternatives: PriorityQueue (sorted set), FifoQueue (list), LifoQueue (list)
    # breadth-first:
    # SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
    # depth-first:
    # SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'
    SCHEDULER_QUEUE_KEY = '%(spider)s:requests'         # redis key under which the scheduler stores requests
    SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"  # serializer for data saved to redis; pickle by default
    SCHEDULER_PERSIST = False        # keep the scheduler queue and dedup records on close? True = keep, False = flush
    SCHEDULER_FLUSH_ON_START = True  # flush the scheduler queue and dedup records on start? True = flush, False = keep
    # SCHEDULER_IDLE_BEFORE_CLOSE = 10  # when fetching from the scheduler, how long at most to wait if it is empty
    SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'  # redis key under which the dedup records are stored
    # DUPEFILTER_CLASS takes precedence; if it is not set, SCHEDULER_DUPEFILTER_CLASS is used
    SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'  # class that implements the dedup rule