# Background / theory: https://blog.csdn.net/apollo_miracle/article/details/84987459
# -*- coding: utf-8 -*-
import re
from copy import deepcopy
import scrapy
class BookSpider(scrapy.Spider):
    """Crawl book data from book.suning.com.

    Pipeline of callbacks:
        parse            -> category tree (big / middle / small categories)
        parse_book_list  -> paginated book listings per small category
        parse_book_detail-> per-book detail page, builds the price-API URL
        parse_book_price -> price JSON endpoint, yields the finished item
    """

    name = 'book'
    allowed_domains = ['suning.com']
    start_urls = ['https://book.suning.com/']

    def parse(self, response):
        """Walk the left-hand category menu and request each small category's listing."""
        # Top-level category menu entries and their parallel sub-menu containers.
        div_list = response.xpath("//div[@class='left-menu-container']/div[@class='menu-list']/div[@class='menu-item']")
        div_sub_list = response.xpath(
            "//div[@class='left-menu-container']/div[@class='menu-list']/div[@class='menu-sub']")
        # The site lazy-loads each listing page in two halves; this template
        # fetches the second half of page 1 (paging=1) for a category id "ci".
        next_url_temp = "https://list.suning.com/emall/showProductList.do?ci={}&pg=03&cp=0&il=0&iy=0&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAAB&id=IDENTIFYING&cc=010&paging=1&sub=0"
        for index, div in enumerate(div_list):
            item = {}
            # Name of the big (top-level) category.
            item["b_cate"] = div.xpath(".//h3/a/text()").extract_first()
            # The sub-menu <div> for this entry sits at the same index in the
            # parallel list (fixed: the original used div_list.index(div),
            # which is O(n) per iteration and ambiguous on duplicates).
            current_sub_div = div_sub_list[index]
            # Middle-category groups inside this sub-menu.
            p_list = current_sub_div.xpath(".//div[@class='submenu-left']/p")
            for p in p_list:
                item["m_cate"] = p.xpath(".//a/text()").extract_first()
                # Small categories: <li> items in the <ul> that follows each <p>.
                s_list = p.xpath("./following-sibling::ul[1]/li")
                for s in s_list:
                    # Small-category name and listing-page URL.
                    item["s_cate"] = s.xpath(".//a/text()").extract_first()
                    item["s_href"] = s.xpath(".//a/@href").extract_first()
                    if not item["s_href"]:
                        # No listing URL -> nothing to crawl for this entry.
                        continue
                    # First (server-rendered) half of listing page 1.
                    # deepcopy so concurrent callbacks don't share one dict.
                    yield scrapy.Request(
                        item["s_href"],
                        callback=self.parse_book_list,
                        meta={"item": deepcopy(item)}
                    )
                    # Second (lazy-loaded) half of listing page 1. The
                    # category id "ci" is embedded in the listing URL.
                    ci = item["s_href"].split("-")[1]
                    yield scrapy.Request(
                        next_url_temp.format(ci),
                        callback=self.parse_book_list,
                        meta={"item": deepcopy(item)}
                    )

    def parse_book_list(self, response):
        """Extract books from a listing page, follow detail links, and paginate."""
        item = response.meta["item"]
        # Each book is an <li> whose class contains 'product book'.
        book_list = response.xpath("//li[contains(@class,'product book')]")
        for book in book_list:
            # Book title.
            item["book_name"] = book.xpath(".//p[@class='sell-point']/a/text()").extract_first()
            # Detail URL is relative; response.follow resolves it for us.
            item["book_href"] = book.xpath(".//p[@class='sell-point']/a/@href").extract_first()
            # Seller / shop name.
            item["book_store"] = book.xpath(".//p[@class='seller oh no-more ']/a/text()").extract_first()
            yield response.follow(
                item["book_href"],
                callback=self.parse_book_detail,
                meta={"item": deepcopy(item)}
            )
        # Pagination templates: each page is again served in two halves
        # (without and with paging=1&sub=0).
        next_page_url_1 = "https://list.suning.com/emall/showProductList.do?ci={}&pg=03&cp={}&il=0&iy=0&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAAB&id=IDENTIFYING&cc=010"
        next_page_url_2 = "https://list.suning.com/emall/showProductList.do?ci={}&pg=03&cp={}&il=0&iy=0&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAAB&id=IDENTIFYING&cc=010&paging=1&sub=0"
        ci = item["s_href"].split("-")[1]
        body = response.body.decode()
        # Current / total page numbers are embedded in inline JavaScript.
        current_page = re.findall(r'param\.currentPage = "(.*?)";', body)
        total_page = re.findall(r'param\.pageNumbers = "(.*?)";', body)
        # BUG FIX: the original used `while`, but current_page never changes
        # inside the loop, so it yielded duplicate next-page requests forever.
        # Guarded indexing also avoids IndexError when the markers are absent
        # (e.g. on the AJAX half-page responses).
        if current_page and total_page and int(current_page[0]) < int(total_page[0]):
            next_page_num = int(current_page[0]) + 1
            # Front half of the next page.
            yield scrapy.Request(
                next_page_url_1.format(ci, next_page_num),
                callback=self.parse_book_list,
                meta={"item": item}
            )
            # Back half of the next page.
            yield scrapy.Request(
                next_page_url_2.format(ci, next_page_num),
                callback=self.parse_book_list,
                meta={"item": item}
            )

    def parse_book_detail(self, response):
        """Parse a book detail page and request the price API for it."""
        item = response.meta["item"]
        price_url_temp = "https://pas.suning.com/nspcsale_0_000000000{}_000000000{}_{}_10_010_0100101_226503_1000000_9017_10106_Z001___{}_{}___.html"
        body = response.body.decode()
        # p1: product id = last URL path segment without its extension.
        p1 = response.url.split("/")[-1].split(".")[-2]
        # p3: shop id = second-to-last URL path segment.
        p3 = response.url.split("/")[-2]
        # p4 / p5 come from inline JSON on the page; fall back to "" when
        # absent (the original formatted a raw empty list into the URL for
        # p4 and raised IndexError for p5).
        caten_ids = re.findall(r'"catenIds":"(.*?)"', body)
        p4 = caten_ids[0] if caten_ids else ""
        weights = re.findall(r'"weight":"(.*?)"', body)
        p5 = weights[0] if weights else ""
        yield scrapy.Request(
            price_url_temp.format(p1, p1, p3, p4, p5),
            callback=self.parse_book_price,
            meta={"item": item}
        )

    def parse_book_price(self, response):
        """Extract the net price from the price-API response and yield the item."""
        item = response.meta["item"]
        price = re.findall(r'"netPrice":"(.*?)"', response.body.decode())
        if price:
            item["book_price"] = price[0]
        yield item

    # Backward-compatible alias for the original (misspelled) callback name.
    parse_book_pirce = parse_book_price