scrapy_redis在scrapy的基础上有了更强大的功能:
request去重,爬虫持久化,实现分布式
与scrapy标准写法的区别:
1、继承的类不同
2、增加 redis_key;3、没有 start_urls
一、RedisSpider
scrapy genspider suning suning.com
# 继承自RedisSpider
from scrapy_redis.spiders import RedisSpider


class SuningSpider(RedisSpider):
    """Suning books spider driven by Redis.

    Instead of a hard-coded ``start_urls`` list, start URLs are popped
    from the Redis list named by ``redis_key``, which is what makes the
    spider distributable across multiple workers.
    """

    name = 'suning'
    allowed_domains = ['book.suning.com']
    # URLs to crawl are pushed to this Redis key; start_urls is intentionally absent.
    redis_key = "suning"
    # start_urls = ['https://book.suning.com/?safp=d488778a.10038.0.8cca61ce53']

    def parse(self, response):
        """Extract all top-level and sub-level book category titles
        from the Suning books page."""
二、RedisCrawlSpider
适用于可实现自动翻页
scrapy genspider -t crawl suning suning.com
# 继承自RedisCrawlSpider
# FIX: the original snippet referenced Rule and LinkExtractor without
# importing them, which raises NameError when the class body is executed.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider


class Suning2Spider(RedisCrawlSpider):
    """CrawlSpider variant for Suning books with automatic link following.

    Start URLs come from the Redis list named by ``redis_key`` (no
    ``start_urls``); ``rules`` then auto-extracts and follows matching
    links, handing each response to ``parse_item``.
    """

    name = 'suning2'
    allowed_domains = ['book.suning.com']
    redis_key = "suning"
    # start_urls = ['http://book.suning.com']

    # LinkExtractor defines a URL extraction rule; links that match are
    # passed to the callback, and follow=True keeps crawling from them.
    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        pass
settings.py
在 settings.py 中添加以下四项设置:REDIS_URL(redis 地址)、DUPEFILTER_CLASS(去重类)、SCHEDULER(调度器)和 SCHEDULER_PERSIST(持久化开关)
"""
打开 item_pipelines 通道
"""
# FIX: the original notes used curly "smart quotes" (' '), which is a
# SyntaxError in Python; replaced with straight ASCII quotes.
ITEM_PIPELINES = {
    'myscrapy.pipelines.MyscrapyPipeline': 300,
    # 'scrapy_redis.pipelines.RedisPipeline': 400,  # scrapy_redis pipeline that saves items to Redis
}

REDIS_URL = 'redis://127.0.0.1:6379'  # Redis connection address
# The Redis address may also be written as separate host/port settings:
# REDIS_HOST = "192.168.207.134"
# REDIS_PORT = 6379

# Which dedup filter fingerprints request objects (shared via Redis).
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Redis-backed scheduler queue shared by all spider instances.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep queue contents in Redis after the spider closes; when False the
# queue is cleared when Redis/the spider shuts down.
SCHEDULER_PERSIST = True
执行爬虫后redis会多出三个键: