import six
from scrapy.utils.misc import load_object
from.import defaults
# Shortcut maps 'setting name' -> 'parameter name' for the redis client (the redis settings mapping).
SETTINGS_PARAMS_MAP = {
    'REDIS_URL': 'url',
    'REDIS_HOST': 'host',
    'REDIS_PORT': 'port',
    'REDIS_ENCODING': 'encoding',
}


def get_redis_from_settings(settings):
    """Return a redis client instance built from a Scrapy settings object.

    This function uses ``get_redis`` to instantiate the client and uses
    ``defaults.REDIS_PARAMS`` global as default values for the parameters.
    You can override them using the ``REDIS_PARAMS`` setting.

    Parameters
    ----------
    settings : Settings
        A scrapy settings object. See the supported settings below.

    Returns
    -------
    server
        Redis client instance.

    Other Parameters
    ----------------
    REDIS_URL : str, optional
        Server connection URL.
    REDIS_HOST : str, optional
        Server host.
    REDIS_PORT : str, optional
        Server port.
    REDIS_ENCODING : str, optional
        Data encoding.
    REDIS_PARAMS : dict, optional
        Additional client parameters.

    """
    # Shallow-copy so the updates below never mutate the shared
    # ``defaults.REDIS_PARAMS`` dict.
    params = defaults.REDIS_PARAMS.copy()
    params.update(settings.getdict('REDIS_PARAMS'))
    # XXX: Deprecate REDIS_* settings.
    for source, dest in SETTINGS_PARAMS_MAP.items():
        # Individual REDIS_* settings take precedence; falsy/unset values
        # are skipped so the defaults survive.
        val = settings.get(source)
        if val:
            params[dest] = val

    # Allow ``redis_cls`` to be given as a dotted path to a class.
    if isinstance(params.get('redis_cls'), six.string_types):
        params['redis_cls'] = load_object(params['redis_cls'])

    return get_redis(**params)


# Backwards compatible alias.
from_settings = get_redis_from_settings
defget_redis(**kwargs):"""Returns a redis client instance.
Parameters
----------
redis_cls : class, optional
Defaults to ``redis.StrictRedis``.
url : str, optional
If given, ``redis_cls.from_url`` is used to instantiate the class.
**kwargs
Extra parameters to be passed to the ``redis_cls`` class.
Returns
-------
server
Redis client instance.
"""# 没有redis_cli,则默认redis连接
redis_cls = kwargs.pop('redis_cls', defaults.REDIS_CLS)
url = kwargs.pop('url',None)# 判断kwargs有没有urlif url:#用url链接redis,优先使用url连接redisreturn redis_cls.from_url(url,**kwargs)else:#用字典的方式连接redisreturn redis_cls(**kwargs)
import logging
import time
from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint
from.import defaults
from.connection import get_redis_from_settings
logger = logging.getLogger(__name__)# scrapy去重是利用集合实现的# TODO: Rename class to RedisDupeFilter.classRFPDupeFilter(BaseDupeFilter):"""Redis-based request duplicates filter.
This class can also be used with default Scrapy's scheduler.
"""
logger = logger
def__init__(self, server, key, debug=False):"""Initialize the duplicates filter.
Parameters
----------
server : redis.StrictRedis
The redis server instance.
redis 连接实例
key : str 存储requests指纹的地方
Redis key Where to store fingerprints.
debug : bool, optional
Whether to log filtered requests.
是否记录过滤的requests
"""#看server是如何生成的,因为我们通过server就可以获取redis中的队列或者set
self.server = server
self.key = key
self.debug = debug
self.logdupes =True# 类方法传递当前的方法
@classmethoddeffrom_settings(cls, settings):"""Returns an instance from given settings.
This uses by default the key ``dupefilter:<timestamp>``. When using the
``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
it needs to pass the spider name in the key.
Parameters
----------
settings : scrapy.settings.Settings
Returns
-------
RFPDupeFilter
A RFPDupeFilter instance.
"""# 获取redis的连接实例
server = get_redis_from_settings(settings)# XXX: This creates one-time key. needed to support to use this# class as standalone dupefilter with scrapy's default scheduler# if scrapy passes spider on open() method this wouldn't be needed# TODO: Use SCRAPY_JOB env as default and fallback to timestamp.# 存取指纹的key
key = defaults.DUPEFILTER_KEY %{
'timestamp':int(time.time())}
debug = settings.getbool('DUPEFILTER_DEBUG')# 默认值是false# 传给当前类,并把参数传递给init函数return cls(server, key=key, debug=debug)
@classmethoddeffrom_crawler(cls, crawler):"""Returns instance from crawler.
Parameters
----------
crawler : scrapy.crawler.Crawler
Returns
-------
RFPDupeFilter
Instance of RFPDupeFilter.
"""return cls.from_settings(crawler.settings)defrequest_seen(self, request):"""Returns True if request was already seen.
Parameters
----------
request : scrapy.http.Request
Returns
-------
bool
"""
fp = self.request_fingerprint(request)# 生成一个指纹# This returns the number of values added, zero if already exists.# 将 指纹加入redis 是一个集合类型# self.server redis连接实例# self.key 存储指纹的key# fp 就是指纹
added = self.server.sadd(self.key, fp)# 当added为0,说明指纹已经存在,返回True,否则返回Falsereturn added ==0defrequest_fingerprint(self, request):"""Returns a fingerprint for a given request.
Parameters
----------
request : scrapy.http.Request
Returns
-------
str
"""return request_fingerprint(request)
@classmethoddeffrom_spider(cls, spider):
settings = spider.settings
server = get_redis_from_settings(settings)
dupefilter_key = settings.get("