Scrapy 爬虫使用布隆过滤器过滤重复 URL(scrapy-redis-bloomfilter-block-cluster,支持 Redis 集群)
首先安装布隆过滤器
pip install scrapy-redis-bloomfilter-block-cluster
设置爬虫的settings.py
# Use the scheduler from scrapy-redis-bloomfilter-block-cluster so that
# requests are queued in Redis instead of in spider memory.
SCHEDULER = "scrapy_redis_bloomfilter_block_cluster.scheduler.Scheduler"
# Persist the Redis queue and dupefilter data when the spider closes.
SCHEDULER_PERSIST = True
# Ensure all spiders share the same duplicate filter through Redis.
DUPEFILTER_CLASS = "scrapy_redis_bloomfilter_block_cluster.dupefilter.RFPDupeFilter"
# Request queue implementation (a priority queue backed by Redis).
SCHEDULER_QUEUE_CLASS = 'scrapy_redis_bloomfilter_block_cluster.queue.PriorityQueue'
# Single-node Redis connection settings (use either REDIS_URL or HOST/PORT):
# REDIS_URL = 'redis://:admin123@localhost:6379'  # or redis://localhost:6379
# REDIS_HOST = 'localhost'
# REDIS_PORT = 6379
# Redis cluster nodes. If REDIS_CLUSTER_NODES is set, REDIS_URL is ignored.
REDIS_CLUSTER_NODES = [
{"host": "", "port": ""},
{"host": "", "port": ""},
{"host": "", "port": ""},
{"host": "", "port": ""},
{"host": "", "port": ""},
{"host": "", "port": ""},
]
# Number of hash functions the Bloom filter uses; should be at least 6.
BLOOMFILTER_HASH_NUMBER = 6
# Redis bits allocated to the Bloom filter, as a power of two:
# 30 means 2^30 bits = 128 MB.
BLOOMFILTER_BIT = 30
# Number of Bloom filter blocks; one block can use at most 512 MB of memory.
BLOOMFILTER_BLOCK_NUM = 1
# Enable debug output from the duplicate filter (presumably logs each
# filtered request — confirm against the package documentation).
DUPEFILTER_DEBUG = True
# The example below shows how to NOT filter the start URLs.
打开爬虫主程序,在构造 Request 时把 dont_filter 参数赋值为 True 即可。
其他情况同理:如果不想过滤文章 URL,则在爬取文章的回调方法里为对应的 Request 同样设置 dont_filter=True。
# Example: pass dont_filter=True so this start URL bypasses the duplicate filter.
# NOTE(review): the result is bound to the name `Request`, which shadows
# scrapy's Request class and is never yielded here — presumably the real
# spider should `yield scrapy.Request(...)`; confirm against the full spider.
Request = scrapy.Request(url=config['start_url'], callback=self.parse, dont_filter=True)