# -*- coding: utf-8 -*-
import scrapy
from urllib.parse import urljoin
from copy import deepcopy
from scrapy_redis.spiders import RedisSpider


class DangdangSpider(RedisSpider):
    name = 'dangdang'
    allowed_domains = ['dangdang.com']
    # start_urls = ['http://book.dangdang.com/']
    redis_key = "dangdang"  # start URLs are popped from this Redis list

    def parse(self, response):
        div_list = response.xpath('//div[@class="con flq_body"]/div')
        for div in div_list:
            item = {}
            item["large_title"] = div.xpath('./dl/dt//text()').extract()
            item["large_title"] = [i.strip() for i in item["large_title"] if len(i.strip()) > 0]
            dl_list = div.xpath('.//dl[@class="inner_dl"]')
            for dl in dl_list:
                item["middle_title"] = dl.xpath('./dt//text()').extract()
                item["middle_title"] = [i.strip() for i in item["middle_title"] if len(i.strip()) > 0]
                a_list = dl.xpath('./dd/a')
                for a in a_list:
                    # extract relative to the current <a>, not the whole <dl>,
                    # otherwise every iteration returns the first link again
                    item["small_title"] = a.xpath('./text()').extract_first()
                    item["small_href"] = a.xpath('./@href').extract_first()
                    if item["small_href"] is not None:
                        yield scrapy.Request(
                            item["small_href"],
                            # deepcopy so concurrent callbacks don't mutate one shared dict
                            meta={"item": deepcopy(item)},
                            callback=self.detail_parse
                        )
    def detail_parse(self, response):
        item = response.meta["item"]
        # keep the selector list; extract_first() here would return a string
        # instead of the <li> selectors we need to iterate over
        li_list = response.xpath('//ul[@class="bigimg"]/li')
        for li in li_list:
            item["img"] = li.xpath('./a/img/@data-original').extract_first()
            item["name"] = li.xpath('./p[@class="name"]/a/text()').extract_first()
            item["price"] = li.xpath('./p[@class="price"]/span/text()').extract_first()
            item["detail"] = li.xpath('./p[@class="detail"]/text()').extract_first()
            item["author"] = li.xpath('./p[@class="search_book_author"]/span[1]//text()').extract()
            item["date"] = li.xpath('./p[@class="search_book_author"]/span[2]/text()').extract_first()
            item["press"] = li.xpath('./p[@class="search_book_author"]/span[3]/a/text()').extract_first()
            yield item
        # "下一页" means "next page"; the literal must be quoted inside the XPath,
        # and the href extracted before it can be joined with the page URL
        next_href = response.xpath('//li[@class="next"]/a[text()="下一页"]/@href').extract_first()
        if next_href is not None:
            yield scrapy.Request(
                urljoin(response.url, next_href),
                meta={"item": deepcopy(item)},
                callback=self.detail_parse
            )
Added to settings.py:
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
REDIS_URL = "redis://127.0.0.1:6379"
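Because the spider subclasses RedisSpider with redis_key = "dangdang", it starts idle and waits for a URL to appear in that Redis list. A minimal seeding sketch using redis-py (the connection details follow the REDIS_URL above; the lpush key must match redis_key):

# seed_start_url.py - push the first URL into the "dangdang" list (hypothetical helper)
import redis

r = redis.Redis(host="127.0.0.1", port=6379)
r.lpush("dangdang", "http://book.dangdang.com/")

With SCHEDULER_PERSIST = True, the request queue and the RFPDupeFilter fingerprints stay in Redis after a crawl stops, so several spider processes can share the same queue and avoid re-fetching pages another worker already handled.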