Install the scrapy_redis library in advance, download Redis and start the Redis server, then create a Scrapy project as usual. Finally, add the following constants to the settings.py file to enable distributed crawling:
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True  # if False, the request queue and dupefilter fingerprints in Redis are flushed when the spider closes
REDIS_URL = "redis://127.0.0.1:6379"
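scrapy_redis also ships an optional item pipeline that pushes every scraped item into a Redis list ("<spider>:items" by default), which is convenient for collecting results from all workers in one place. A minimal settings addition; the priority value 300 is an arbitrary choice:

ITEM_PIPELINES = {
    # optional: serialize scraped items into the Redis list "<spider>:items"
    'scrapy_redis.pipelines.RedisPipeline': 300,
}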
Using the regular scrapy.Spider approach (the scrapy_redis scheduler and dupefilter configured above make it distributed automatically); below is the spider file:
import scrapy


class JdSpider(scrapy.Spider):
    name = 'jd'
    allowed_domains = ['jd.com', 'p.3.cn']
    start_urls = ['https://book.jd.com/booksort.html']

    def parse(self, response):
        # list of top-level book categories
        dt_list = response.xpath("//div[@class='mc']/dl/dt")
        for dt in dt_list:
            item = {}
            item['b_cate'] = dt.xpath("./a/text()").extract_first()
            yield item
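You can start scrapy crawl jd on as many machines as you like; they all share one request queue and one fingerprint set in Redis. A quick sketch for inspecting those keys with the redis-py client, assuming scrapy_redis's default key names "<spider>:requests" and "<spider>:dupefilter" and the default priority-queue (zset) backend:

import redis

# connect with the same REDIS_URL as in settings.py above
r = redis.from_url("redis://127.0.0.1:6379")

# "jd:requests" is the shared request queue (a zset by default),
# "jd:dupefilter" is the set of seen request fingerprints
print(r.zcard("jd:requests"))    # number of pending requests
print(r.scard("jd:dupefilter"))  # number of deduplicated fingerprints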
To create a RedisCrawlSpider, inherit from the imported RedisCrawlSpider class; instead of hard-coding start_urls, you must lpush the start URL onto the spider's redis_key with a Redis client before it will begin crawling. The same applies when inheriting from RedisSpider: its start URLs must be pushed into Redis as well. A push example follows the code. Below is the spider file:
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from scrapy_redis.spiders import RedisCrawlSpider


class DangdangSpider(RedisCrawlSpider):
    name = 'mycrawler_redis'
    redis_key = 'mycrawler:start_urls'
    allowed_domains = ['dangdang.com']

    rules = (
        # listing pages
        Rule(LinkExtractor(restrict_xpaths="//ul[@class='title-state-ul']/li"), callback='parse_page'),
        # pagination on the listing pages
        Rule(LinkExtractor(restrict_xpaths="//a[@class='arrow-page prov_rota']"), follow=True),
    )

    def parse_page(self, response):
        item = {}
        item['content'] = response.xpath("//div[@class='details-box']/pre/text()").extract()
        print(item)
        yield item  # yield so the item actually reaches the pipelines
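The spider blocks until a start URL appears under its redis_key. A minimal sketch of pushing one with redis-py; the URL here is just a placeholder seed, substitute the real Dangdang listing page you want to crawl:

import redis

r = redis.from_url("redis://127.0.0.1:6379")
# push the seed URL onto the key the spider is watching;
# "http://book.dangdang.com/" is a placeholder
r.lpush("mycrawler:start_urls", "http://book.dangdang.com/")

The same can be done from the redis-cli client: lpush mycrawler:start_urls followed by the URL.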