I. Create a regular Scrapy project
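Nothing scrapy-redis specific happens in this step; any project created with the stock Scrapy CLI works. For example (XX and book are the placeholder project and spider names used throughout this post):

Command: scrapy startproject XX
Command: cd XX
Command: scrapy genspider book xx.com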
II. Changes in the spider file
import scrapy
from XX.items import XXItem
import json
# ---- 1. Import the scrapy-redis spider class
from scrapy_redis.spiders import RedisSpider


# ---- 2. Change the parent class to RedisSpider
class BookSpider(RedisSpider):
    name = 'book'

    # ---- 3. Comment out the start URLs and the static allowed domains
    # allowed_domains = ['xx.com', 'x.x.cn']
    # start_urls = ['https://xxxx.xx.com/xxxxx.html']

    # ---- 4. Set the redis key the spider reads start URLs from
    redis_key = 'start_url'

    # ---- 5. Build the allowed domains dynamically from a spider argument
    def __init__(self, *args, **kwargs):
        domain = kwargs.pop("domains", "")
        # wrap in list() so the result survives repeated iteration on Python 3
        self.allowed_domains = list(filter(None, domain.split(',')))
        print("allowed_domains:", self.allowed_domains)
        super(BookSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        pass
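The domains value in step 5 is an ordinary Scrapy spider argument, so it can be passed with -a when the spider is started (the domain values here are placeholders):

Command: scrapy runspider spider_name.py -a domains=xx.com,x.x.cn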
III. Settings in settings.py
SPIDER_MODULES = ['XX.spiders']
NEWSPIDER_MODULE = 'XX.spiders'

USER_AGENT = 'scrapy-redis (+https://github.com/rolando/scrapy-redis)'

# Use the redis-backed dupefilter, so request fingerprints are
# deduplicated against a set stored in redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the redis-backed scheduler, so the task queue of request objects
# lives in redis and is shared by all spider instances
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the task queue and dupefilter in redis after the crawl stops
SCHEDULER_PERSIST = True
# Optional queue types (the priority queue is the default):
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

ITEM_PIPELINES = {
    # 'example.pipelines.ExamplePipeline': 300,
    # With this pipeline enabled, every scraped item is also stored in redis
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

LOG_LEVEL = 'DEBUG'

REDIS_URL = "redis://127.0.0.1:6379"

# Introduce an artificial delay between requests so the distributed
# crawl does not hammer the target site
DOWNLOAD_DELAY = 1
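With RedisPipeline enabled, each scraped item is serialized to JSON and pushed onto a redis list, by default named "<spider name>:items" (here book:items), so any separate process can consume the results. A minimal consumer sketch, assuming the redis-py package and that default key:

import json
import redis

r = redis.Redis(host="127.0.0.1", port=6379)

while True:
    # BLPOP blocks until an item arrives; RedisPipeline stores each
    # item as a JSON string in the "book:items" list
    _, data = r.blpop("book:items")
    print(json.loads(data))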
IV. Set the start URL in redis
1. Start the same spider several times in separate terminal windows
Command: scrapy runspider spider_name.py
Each instance connects to redis and sits idle until a start URL appears under redis_key; from then on, all instances share the request queue in redis.
2. Push the start URL from a redis client
In redis-cli, run (the key must match the spider's redis_key):
lpush start_url http://baidu.com
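The same push can also be done from code; a minimal sketch, assuming the redis-py package and the redis_key value start_url configured in the spider above:

import redis

r = redis.Redis(host="127.0.0.1", port=6379)
# The key must match the spider's redis_key; whichever idle spider
# instance pops this URL first issues the initial request
r.lpush("start_url", "http://baidu.com")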