一、自身的spider.py设置
from scrapy_redis.spiders import RedisSpider
class MySpider(RedisSpider):
    """Example spider built on scrapy-redis' ``RedisSpider``.

    The crawl can optionally be restricted to a set of domains by passing a
    comma-separated ``domain`` argument when the spider is created
    (e.g. ``domain='example.com,example.org'``).
    """

    # Redis key associated with this spider's start URLs (used by RedisSpider).
    redis_key = 'myspider:start_urls'

    def __init__(self, *args, **kwargs):
        """Initialise the spider and derive ``allowed_domains``.

        Args:
            *args: Positional arguments forwarded to ``RedisSpider.__init__``.
            **kwargs: Keyword arguments forwarded to ``RedisSpider.__init__``.
                An optional ``domain`` entry (comma-separated domain names)
                is removed before forwarding and used to build
                ``allowed_domains``.
        """
        # If an initial domain list was supplied at start-up, use it.
        domain = kwargs.pop('domain', '')
        # Keep only the non-empty entries, and store them as a real list:
        # on Python 3 a bare ``filter(...)`` is a single-use iterator that is
        # truthy even when it yields nothing, which makes it unsuitable as a
        # value that is checked and iterated more than once.
        self.allowed_domains = [name for name in domain.split(',') if name]
        super(MySpider, self).__init__(*args, **kwargs)
二、配置文件settings.py的设置
# Scrapy settings for example project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/topics/settings.html
#
SPIDER_MODULES = ['example.spiders']
NEWSPIDER_MODULE = 'example.spiders'

USER_AGENT = 'scrapy-redis (+https://github.com/rolando/scrapy-redis)'

# Duplicate-request filter provided by scrapy-redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Scheduler provided by scrapy-redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the scheduler state so an interrupted crawl can be resumed.
SCHEDULER_PERSIST = True
# Alternative request-queue classes; uncomment one to override the
# scheduler's default queue.
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

ITEM_PIPELINES = {
    'example.pipelines.ExamplePipeline': 300,
    # Automatically stores scraped items in the Redis database.
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

LOG_LEVEL = 'DEBUG'

# Introduce an artificial delay to make use of parallelism to speed up the
# crawl.
DOWNLOAD_DELAY = 1

# Location of the Redis server.
REDIS_URL = 'redis://127.0.0.1:6379'