1.创建scrapy项目
2.安装scrapy redis
pip install scrapy-redis
3.设置setting.py
3.1 添加 ITEM_PIPELINES
ITEM_PIPELINES = {
    # scrapy-redis 配置
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
3.2 添加scrapy-redis属性配置
""" scrapy-redis 配置 """
# Enables scheduling storing requests queue in redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Ensure all spiders share same duplicates filter through redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# 调度器启用 Redis 存储 Requests 队列
# SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# 确保所有的爬虫实例使用 Redis 进行重复过滤
# DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# 将 Requests 队列持久化到 Redis，可支持暂停或重启爬虫
# SCHEDULER_PERSIST = True
# Requests 的调度策略，默认为优先级队列
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'