I. Create a regular Scrapy project
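Nothing scrapy-redis specific happens in this step; any project created with the stock Scrapy CLI works. For example (XX and book are the placeholder project and spider names used throughout this post):

Command: scrapy startproject XX
Command: cd XX
Command: scrapy genspider book xx.com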
II. Changes in the spider file
import scrapy
from XX.items import XXItem
import json
# ---- 1. Import the scrapy-redis spider class
from scrapy_redis.spiders import RedisSpider


# ---- 2. Change the parent class to RedisSpider
class BookSpider(RedisSpider):
    name = 'book'

    # ---- 3. Comment out the start URLs and the static allowed domains
    # allowed_domains = ['xx.com', 'x.x.cn']
    # start_urls = ['https://xxxx.xx.com/xxxxx.html']

    # ---- 4. Set the redis key the spider reads start URLs from
    redis_key = 'start_url'

    # ---- 5. Build the allowed domains dynamically from a spider argument
    def __init__(self, *args, **kwargs):
        domain = kwargs.pop("domains", "")
        # wrap in list() so the result survives repeated iteration on Python 3
        self.allowed_domains = list(filter(None, domain.split(',')))
        print("allowed_domains:", self.allowed_domains)
        super(BookSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        pass
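The domains value in step 5 is an ordinary Scrapy spider argument, so it can be passed with -a when the spider is started (the domain values here are placeholders):

Command: scrapy runspider spider_name.py -a domains=xx.com,x.x.cn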
III. Settings in settings.py
SPIDER_MODULES = ['XX.spiders']
NEWSPIDER_MODULE = 'XX.spiders'

USER_AGENT = 'scrapy-redis (+https://github.com/rolando/scrapy-redis)'

# Use the redis-backed dupefilter, so request fingerprints are
# deduplicated against a set stored in redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the redis-backed scheduler, so the task queue of request objects
# lives in redis and is shared by all spider instances
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the task queue and dupefilter in redis after the crawl stops
SCHEDULER_PERSIST = True
# Optional queue types (the priority queue is the default):
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

ITEM_PIPELINES = {
    # 'example.pipelines.ExamplePipeline': 300,
    # With this pipeline enabled, every scraped item is also stored in redis
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

LOG_LEVEL = 'DEBUG'

REDIS_URL = "redis://127.0.0.1:6379"

# Introduce an artificial delay between requests so the distributed
# crawl does not hammer the target site
DOWNLOAD_DELAY = 1
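With RedisPipeline enabled, each scraped item is serialized to JSON and pushed onto a redis list, by default named "<spider name>:items" (here book:items), so any separate process can consume the results. A minimal consumer sketch, assuming the redis-py package and that default key:

import json
import redis

r = redis.Redis(host="127.0.0.1", port=6379)

while True:
    # BLPOP blocks until an item arrives; RedisPipeline stores each
    # item as a JSON string in the "book:items" list
    _, data = r.blpop("book:items")
    print(json.loads(data))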
IV. Set the start URL in redis
1. Start the same spider several times in separate terminal windows
Command: scrapy runspider spider_name.py
Each instance connects to redis and sits idle until a start URL appears under redis_key; from then on, all instances share the request queue in redis.
2. Push the start URL from a redis client
In redis-cli, run (the key must match the spider's redis_key):
lpush start_url http://baidu.com
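The same push can also be done from code; a minimal sketch, assuming the redis-py package and the redis_key value start_url configured in the spider above:

import redis

r = redis.Redis(host="127.0.0.1", port=6379)
# The key must match the spider's redis_key; whichever idle spider
# instance pops this URL first issues the initial request
r.lpush("start_url", "http://baidu.com")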