Implementing a distributed JD Books crawler with scrapy_redis

With scrapy_redis, the request queue and the duplicate filter live in a shared Redis instance instead of each process's memory, so any number of spider processes, on one machine or several, can work on the same crawl, and the crawl can be paused and resumed. Two files carry the whole setup: the project settings and the spider itself.
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for jdbook project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'jdbook'
SPIDER_MODULES = ['jdbook.spiders']
NEWSPIDER_MODULE = 'jdbook.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Use scrapy_redis's scheduler, which keeps the request queue in Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Make every spider instance deduplicate requests through Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Requests are serialized with pickle by default, but any module providing
# loads/dumps can be used. Note that pickle is not compatible across Python versions.
# SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"
# Don't clear the Redis queues on shutdown; this allows pausing/resuming a crawl
SCHEDULER_PERSIST = True
# Schedule requests using a priority queue (the default)
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
# Alternative queue classes
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'
# Max idle time, to keep distributed spiders from closing while waiting for requests
# SCHEDULER_IDLE_BEFORE_CLOSE = 10
# Store scraped items in Redis for post-processing
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
}
# Connection URL of the shared Redis instance (no password here)
REDIS_URL = 'redis://127.0.0.1:6379'
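
With RedisPipeline enabled, every scraped item is serialized to JSON and pushed onto a Redis list, keyed by default as "<spider name>:items", so "jd:items" for this project. Post-processing can then happen in a separate consumer process. A minimal consumer sketch, assuming redis-py is installed and the default items key is unchanged:

import json

import redis

# Hypothetical post-processing consumer; assumes the default scrapy_redis
# items key "jd:items" and a local Redis instance
r = redis.StrictRedis.from_url('redis://127.0.0.1:6379')

while True:
    # BLPOP blocks until an item arrives and returns a (key, value) pair
    _, data = r.blpop('jd:items')
    item = json.loads(data)
    print(item.get('book_title'), item.get('book_price'))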
jd.py
# -*- coding: utf-8 -*-
import copy
import json

import scrapy
from scrapy_redis.spiders import RedisSpider


class JdSpider(RedisSpider):
    name = 'jd'
    # p.3.cn is allowed because the price API lives on that domain
    allowed_domains = ['jd.com', 'p.3.cn']
    # With a plain Spider you would set:
    # start_urls = ['https://book.jd.com/booksort.html']
    # A RedisSpider instead pops its start URLs from this Redis list
    redis_key = "jdbook"

    def parse(self, response):
        # Each dt on the category page is a top-level category; the first
        # following dd holds its sub-category link
        item = {}
        dt_list = response.xpath('//div[@class="mc"]/dl/dt')
        for dt in dt_list:
            item["dt_title"] = dt.xpath('./a/text()').extract_first()
            item["dd_title"] = dt.xpath('./following-sibling::dd[1]/em/a/text()').extract_first()
            dd_href = dt.xpath('./following-sibling::dd[1]/em/a/@href').extract_first()
            # The hrefs are protocol-relative (//list.jd.com/...), so let urljoin
            # supply the scheme; deepcopy the item because the same dict is
            # reused on every loop iteration
            yield scrapy.Request(
                response.urljoin(dd_href),
                callback=self.parse_book_list,
                meta={"item": copy.deepcopy(item)}
            )

    def parse_book_list(self, response):
        item = response.meta["item"]
        book_list = response.xpath('//li[@class="gl-item"]')
        for book in book_list:
            title = book.xpath('.//div[@class="p-name"]/a/em/text()').extract_first()
            item["book_title"] = title.strip() if title else None
            item["book_author"] = [
                i.strip()
                for i in book.xpath('.//span[@class="author_type_1"]/a/text()').extract()
            ]
            item["book_store"] = book.xpath('.//span[@class="p-bi-store"]/a/text()').extract_first()
            # The price is not in the listing HTML, so fetch it from the price API
            sku_id = book.xpath('./div/@data-sku').extract_first()
            if sku_id:
                yield scrapy.Request(
                    "https://p.3.cn/prices/mgets?skuIds=J_" + sku_id,
                    callback=self.parse_price,
                    meta={"item": copy.deepcopy(item)}
                )
        # Follow pagination; stop when there is no "next page" link
        next_href = response.xpath('//a[@class="pn-next"]/@href').extract_first()
        if next_href:
            yield scrapy.Request(
                response.urljoin(next_href),
                callback=self.parse_book_list,
                meta={"item": item}
            )

    def parse_price(self, response):
        item = response.meta["item"]
        # The API responds with a JSON array; take the "op" price field
        # of its first element
        item["book_price"] = json.loads(response.text)[0].get('op')
        # Yield the item so RedisPipeline (configured in settings.py) stores it
        yield item
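
Because the spider subclasses RedisSpider, it starts idle and waits for a URL to appear under its redis_key. Start one or more workers with "scrapy crawl jd", on as many machines as you like, all pointing at the same REDIS_URL, then push the category page once to kick off the crawl. A sketch using redis-py; the same thing can be done from redis-cli with "lpush jdbook https://book.jd.com/booksort.html":

import redis

# Push the seed URL onto the list named by the spider's redis_key.
# Every running "scrapy crawl jd" worker shares this queue through Redis,
# and the shared dupefilter keeps them from fetching the same page twice.
r = redis.StrictRedis.from_url('redis://127.0.0.1:6379')
r.lpush('jdbook', 'https://book.jd.com/booksort.html')

Because SCHEDULER_PERSIST is True, stopping the workers leaves the queue and the dupefilter in Redis, so restarting them resumes the crawl where it left off.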