# -*- coding: utf-8 -*-
import scrapy
from copy import deepcopy
from scrapy_redis.spiders import RedisSpider
class DdSpider(RedisSpider):
    """Distributed spider that scrapes book listings from dangdang.com.

    Start URLs are read from the Redis list named by ``redis_key``. Seed it
    from a Redis client, for example::

        lpush dangdang http://book.dangdang.com/
    """

    name = 'dd'
    allowed_domains = ['dangdang.com']
    # start_urls = ['http://book.dangdang.com/']
    # The start address is pushed into Redis under this key instead.
    redis_key = "dangdang"

    def parse(self, response):
        """Walk the category tree and request every small-category list page.

        Args:
            response: Response of the start page holding the category menu.

        Yields:
            scrapy.Request: One request per small-category link, handled by
            ``parse_book_list``. The category fields collected so far
            (``b_cate``, ``m_cate``, ``s_cate``, ``s_href``) travel along in
            ``meta["item"]``.
        """
        # The first and the last <div> are deliberately not scraped.
        category_divs = response.xpath("//div[@class='con flq_body']/div")[1:-1]
        for category_div in category_divs:
            item = {}
            item["b_cate"] = category_div.xpath(
                ".//dl[contains(@class,'primary_dl')]/dt//text()"
            ).extract()
            # Query relative to the current big category; querying the whole
            # response would attach every middle category to every big one.
            for middle_dl in category_div.xpath(".//dl[@class='inner_dl']"):
                # Name of the middle category.
                item["m_cate"] = middle_dl.xpath("./dt//text()").extract()
                # Links of the small categories in this group.
                for link in middle_dl.xpath("./dd/a"):
                    item["s_cate"] = link.xpath("./text()").extract_first()
                    item["s_href"] = link.xpath("./@href").extract_first()
                    if not item["s_href"]:
                        # An anchor without href cannot be requested.
                        continue
                    # deepcopy: ``item`` is reused and overwritten by the
                    # following loop iterations.
                    yield scrapy.Request(
                        response.urljoin(item["s_href"]),
                        callback=self.parse_book_list,
                        meta={"item": deepcopy(item)},
                    )

    def parse_book_list(self, response):
        """Extract the books of one list page and follow the pagination.

        Args:
            response: Response of a list page; ``response.meta["item"]``
                holds the category fields collected by ``parse``.

        Yields:
            dict: One item per book, i.e. the category fields plus the
            ``book_*`` fields.
            scrapy.Request: A request for the next list page, if there is one.
        """
        # Category fields handed over by the previous callback.
        category = response.meta["item"]
        for li in response.xpath("//ul[@class='bigimg']/li"):
            # Give every book its own dict so yielded items never alias
            # each other.
            item = deepcopy(category)
            item["book_name"] = li.xpath("./a/@title").extract_first()
            item["book_href"] = li.xpath("./a/@href").extract_first()
            item["book_author"] = li.xpath(
                ".//p[@class='search_book_author']/span[1]/a/@title"
            ).extract()
            item["book_pub_data"] = li.xpath(
                ".//p[@class='search_book_author']/span[2]/text()"
            ).extract_first()
            item["book_press"] = li.xpath(
                ".//p[@class='search_book_author']/span[3]/a/@title"
            ).extract_first()
            item["book_desc"] = li.xpath(".//p[@class='detail']/text()").extract_first()
            item["book_price"] = li.xpath(
                ".//p[@class='price']/span[1]/text()"
            ).extract_first()
            store_name = li.xpath(
                ".//span[@class='new_lable']/span[1]/text()"
            ).extract_first()
            # When no store name is present, fall back to the default label.
            item["book_store_name"] = "当当自营" if store_name is None else store_name
            yield item

        # Pagination: only follow the "next" link when it actually exists.
        next_url = response.xpath("//li[@class='next']/a/@href").extract_first()
        if next_url:
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.parse_book_list,
                meta={"item": category},
            )
# Pipeline: item data processing
import re
class DangdangPipeline(object):
    """Item pipeline that cleans the category name fields of scraped items."""

    # Any run of whitespace. For ``str`` patterns ``\s`` also matches the
    # non-breaking space (\xa0), so line breaks, tabs and \xa0 are all covered.
    _WHITESPACE_RE = re.compile(r"\s+")

    def process_item(self, item, spider):
        """Clean the ``b_cate`` and ``m_cate`` lists of ``item`` in place.

        Args:
            item: Mapping with the keys ``b_cate`` and ``m_cate``, each a
                list of text fragments.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object, with both fields cleaned.
        """
        item["b_cate"] = self.process_content(item["b_cate"])
        item["m_cate"] = self.process_content(item["m_cate"])
        print(item)
        return item

    def process_content(self, content):
        """Strip whitespace from every fragment and drop the empty ones.

        Args:
            content: Iterable of strings; ``None`` or an empty sequence is
                treated as "no content".

        Returns:
            list: The fragments with all whitespace removed, excluding
            fragments that became empty.
        """
        if not content:
            return []
        stripped = (self._WHITESPACE_RE.sub("", text) for text in content)
        # Fragments that consisted only of whitespace are now empty strings.
        return [text for text in stripped if text]