I recently finished working through distributed crawlers, and I found that reading the scrapy-redis source code really matters for understanding how distributed crawling works. Without further ado, straight to the material:
1. Creating the Project
First, we create a Scrapy project to study against. Run the following command in the terminal:
scrapy startproject <project_name>
Next, we copy the scrapy-redis source code into the Scrapy project. It can be downloaded from:
https://github.com/rmax/scrapy-redis
There are two ways to get it: download the zip file or git clone the repository. Personally, I find the familiar zip download more convenient.
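For reference, the git route is a single command (the zip file is simply the GitHub archive of the same repository):

git clone https://github.com/rmax/scrapy-redis.git

Either way, the part we need is the scrapy_redis package directory inside the downloaded source.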
2. Source Code Analysis
The architecture of a distributed crawler system is as follows:
(figure: distributed crawler system architecture)
The scrapy-redis source mainly consists of the following files:
connection.py
defaults.py
dupefilter.py
picklecompat.py
pipelines.py
queue.py
scheduler.py
spiders.py
utils.py
2.1 connection.py
This file manages the connection to Redis. It is referenced more often than any other file and is arguably the most important one: it produces the Redis client instance (server) that pipelines.py, queue.py, and scheduler.py all call into. The annotated source of connection.py follows:
import six

from scrapy.utils.misc import load_object

from . import defaults


# Shortcut maps 'setting name' -> 'parameter name'.
# Mapping from Scrapy setting names to redis-py parameter names.
SETTINGS_PARAMS_MAP = {
    'REDIS_URL': 'url',
    'REDIS_HOST': 'host',
    'REDIS_PORT': 'port',
    'REDIS_ENCODING': 'encoding',
}


def get_redis_from_settings(settings):
    # Build the connection parameters and return a Redis client instance.
    """Returns a redis client instance from given Scrapy settings object.

    This function uses ``get_client`` to instantiate the client and uses
    ``defaults.REDIS_PARAMS`` global as defaults values for the parameters. You
    can override them using the ``REDIS_PARAMS`` setting.

    Parameters
    ----------
    settings : Settings
        A scrapy settings object. See the supported settings below.

    Returns
    -------
    server
        Redis client instance.

    Other Parameters
    ----------------
    REDIS_URL : str, optional
        Server connection URL.
    REDIS_HOST : str, optional
        Server host.
    REDIS_PORT : str, optional
        Server port.
    REDIS_ENCODING : str, optional
        Data encoding.
    REDIS_PARAMS : dict, optional
        Additional client parameters.
    """
    # Shallow copy, so that mutating params cannot change the
    # module-level defaults.REDIS_PARAMS.
    params = defaults.REDIS_PARAMS.copy()
    # Merge the user's REDIS_PARAMS setting into the defaults.
    params.update(settings.getdict('REDIS_PARAMS'))
    # XXX: Deprecate REDIS_* settings.
    # Walk the mapping table and pick up the individual REDIS_* settings.
    for source, dest in SETTINGS_PARAMS_MAP.items():
        # Values from settings take precedence.
        val = settings.get(source)
        # If the setting is absent, params keeps its default.
        if val:
            params[dest] = val
    # Allow ``redis_cls`` to be a path to a class.
    if isinstance(params.get('redis_cls'), six.string_types):
        params['redis_cls'] = load_object(params['redis_cls'])
    return get_redis(**params)


# Backwards compatible alias.
from_settings = get_redis_from_settings


def get_redis(**kwargs):
    """Returns a redis client instance.

    Parameters
    ----------
    redis_cls : class, optional
        Defaults to ``redis.StrictRedis``.
    url : str, optional
        If given, ``redis_cls.from_url`` is used to instantiate the class.
    **kwargs
        Extra parameters to be passed to the ``redis_cls`` class.

    Returns
    -------
    server
        Redis client instance.
    """
    # Without an explicit redis_cls, fall back to the default client class.
    redis_cls = kwargs.pop('redis_cls', defaults.REDIS_CLS)
    url = kwargs.pop('url', None)  # check whether a URL was supplied
    if url:
        # A URL takes precedence: connect via redis_cls.from_url().
        return redis_cls.from_url(url, **kwargs)
    else:
        # Otherwise connect with keyword parameters (host, port, ...).
        return redis_cls(**kwargs)
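To see how this resolution behaves end to end, here is a minimal sketch (assuming a Redis server running locally; the setting values and key names are illustrative, not from the original post). Note that REDIS_URL takes precedence over host/port inside get_redis():

from scrapy.settings import Settings
from scrapy_redis.connection import get_redis_from_settings

settings = Settings({
    'REDIS_HOST': 'localhost',                # would map to 'host'
    'REDIS_PORT': 6379,                       # would map to 'port'
    'REDIS_URL': 'redis://localhost:6379/0',  # wins: from_url() is used
})

server = get_redis_from_settings(settings)    # a plain redis-py client
server.set('demo:key', 'hello')
print(server.get('demo:key'))                 # b'hello'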
2.2 defaults.py
This file holds the default values that the rest of scrapy-redis falls back on whenever the corresponding setting is not provided. The annotated source follows:
import redis

# For standalone use.
# Key used by the dupefilter when it runs standalone (outside the scheduler).
DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'

# Key under which scraped items are stored; %(spider)s is the spider name.
PIPELINE_KEY = '%(spider)s:items'

# Client class used to connect to Redis.
REDIS_CLS = redis.StrictRedis

# Character encoding for the connection.
REDIS_ENCODING = 'utf-8'

# Sane connection defaults.
# Default parameters for the Redis connection.
REDIS_PARAMS = {
    'socket_timeout': 30,
    'socket_connect_timeout': 30,
    'retry_on_timeout': True,
    'encoding': REDIS_ENCODING,
}

# Key of the queue that stores the requests to crawl.
SCHEDULER_QUEUE_KEY = '%(spider)s:requests'

# Queue class; the priority queue decides the order requests enter and leave.
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'

# Key of the set where request fingerprints are stored for deduplication.
SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'

# Class used to generate request fingerprints.
SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'

# Key that holds the start URLs; %(name)s is the spider name.
START_URLS_KEY = '%(name)s:start_urls'

# Whether the start URLs live in a set (False means a list).
START_URLS_AS_SET = False
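All of these defaults can be overridden from the project's settings.py. A minimal sketch of a typical scrapy-redis configuration (the Redis URL is a placeholder; adjust it to your environment):

# settings.py (sketch)

# Route scheduling and request deduplication through Redis.
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'

# Override the defaults shown above as needed.
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
REDIS_URL = 'redis://localhost:6379/0'

# Store scraped items in Redis under '<spider>:items'.
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
}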
2.3 dupefilter.py
import logging
import time

from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint

from . import defaults
from .connection import get_redis_from_settings


logger = logging.getLogger(__name__)


# Deduplication is implemented with a Redis set.
# TODO: Rename class to RedisDupeFilter.
class RFPDupeFilter(BaseDupeFilter):
    """Redis-based request duplicates filter.

    This class can also be used with default Scrapy's scheduler.
    """

    logger = logger

    def __init__(self, server, key, debug=False):
        """Initialize the duplicates filter.

        Parameters
        ----------
        server : redis.StrictRedis
            The redis server instance (our connection to Redis).
        key : str
            Redis key where to store fingerprints.
        debug : bool, optional
            Whether to log filtered requests.
        """
        # server is the Redis connection instance; through it we can reach
        # the request queues and the fingerprint set stored in Redis.
        self.server = server
        self.key = key
        self.debug = debug
        self.logdupes = True

    @classmethod
    def from_settings(cls, settings):
        """Returns an instance from given settings.

        This uses by default the key ``dupefilter:<timestamp>``. When using the
        ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
        it needs to pass the spider name in the key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RFPDupeFilter
            A RFPDupeFilter instance.
        """
        # Get the Redis connection instance.
        server = get_redis_from_settings(settings)
        # XXX: This creates one-time key. needed to support to use this
        # class as standalone dupefilter with scrapy's default scheduler
        # if scrapy passes spider on open() method this wouldn't be needed
        # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
        # Key under which the fingerprints are stored.
        key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
        debug = settings.getbool('DUPEFILTER_DEBUG')  # defaults to False
        # Instantiate this class, forwarding the parameters to __init__.
        return cls(server, key=key, debug=debug)

    @classmethod
    def from_crawler(cls, crawler):
        """Returns instance from crawler.

        Parameters
        ----------
        crawler : scrapy.crawler.Crawler

        Returns
        -------
        RFPDupeFilter
            Instance of RFPDupeFilter.
        """
        return cls.from_settings(crawler.settings)

    def request_seen(self, request):
        """Returns True if request was already seen.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        bool
        """
        fp = self.request_fingerprint(request)  # compute the fingerprint
        # This returns the number of values added, zero if already exists.
        # SADD the fingerprint into the Redis set:
        #   self.server -- the Redis connection instance
        #   self.key    -- the key of the fingerprint set
        #   fp          -- the fingerprint itself
        added = self.server.sadd(self.key, fp)
        # added == 0 means the fingerprint already existed, so the request
        # has been seen before: return True; otherwise return False.
        return added == 0
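    # ------------------------------------------------------------------
    # Aside (not part of dupefilter.py): the SADD semantics relied on
    # above can be checked in isolation. A minimal sketch, assuming a
    # local Redis on the default port and the redis-py client:
    #
    #     import redis
    #     server = redis.StrictRedis()
    #     server.sadd('demo:dupefilter', 'fp1')  # -> 1, new fingerprint
    #     server.sadd('demo:dupefilter', 'fp1')  # -> 0, already seen
    # ------------------------------------------------------------------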
    def request_fingerprint(self, request):
        """Returns a fingerprint for a given request.