Scraping book information from Suning.com with Scrapy

For the underlying theory, see: https://blog.csdn.net/apollo_miracle/article/details/84987459
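The spider below assumes a standard Scrapy project layout; the project name, spider file path, and output file in this sketch are illustrative and not part of the original post:

scrapy startproject suning_book
cd suning_book
# save the spider below as suning_book/spiders/book.py, then run it and
# dump the yielded items to a JSON Lines file via Scrapy's feed export
scrapy crawl book -o books.jl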

# -*- coding: utf-8 -*-
import re
from copy import deepcopy

import scrapy


class BookSpider(scrapy.Spider):
    name = 'book'
    allowed_domains = ['suning.com']
    start_urls = ['https://book.suning.com/']

    def parse(self, response):
        # Get the group of top-level categories
        div_list = response.xpath("//div[@class='left-menu-container']/div[@class='menu-list']/div[@class='menu-item']")
        div_sub_list = response.xpath(
            "//div[@class='left-menu-container']/div[@class='menu-list']/div[@class='menu-sub']")
        for div in div_list:
            item = {}
            # Name of the top-level category
            item["b_cate"] = div.xpath(".//h3/a/text()").extract_first()
            # The menu-sub block that holds this top-level category's mid-level categories
            current_sub_div = div_sub_list[div_list.index(div)]
            # Get the group of mid-level categories
            p_list = current_sub_div.xpath(".//div[@class='submenu-left']/p")
            for p in p_list:
                item["m_cate"] = p.xpath(".//a/text()").extract_first()
                # 获取小分类的分组
                s_list = p.xpath("./following-sibling::ul[1]/li")
                for s in s_list:
                    # Name of the sub-category
                    item["s_cate"] = s.xpath(".//a/text()").extract_first()
                    # URL of the sub-category
                    item["s_href"] = s.xpath(".//a/@href").extract_first()

                    # Request the book list page of this sub-category;
                    # deepcopy so each request carries its own snapshot of item
                    yield scrapy.Request(
                        item["s_href"],
                        callback=self.parse_book_list,
                        meta={"item": deepcopy(item)}
                    )

                    # The list page is rendered in two halves; also request the URL
                    # that returns the latter part of the first page's data
                    next_url_temp = "https://list.suning.com/emall/showProductList.do?ci={}&pg=03&cp=0&il=0&iy=0&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAAB&id=IDENTIFYING&cc=010&paging=1&sub=0"
                    # Extract the ci parameter from the sub-category URL
                    ci = item["s_href"].split("-")[1]
                    next_url = next_url_temp.format(ci)
                    yield scrapy.Request(
                        next_url,
                        callback=self.parse_book_list,
                        meta={"item": deepcopy(item)}
                    )

    def parse_book_list(self, response):
        item = response.meta["item"]
        # Get the group of books on the list page
        # book_list = response.xpath("//div[@id='filter-results']/ul/li")
        book_list = response.xpath("//li[contains(@class,'product      book')]")
        for book in book_list:
            # Book title
            item["book_name"] = book.xpath(".//p[@class='sell-point']/a/text()").extract_first()
            # Book detail URL (not a complete URL)
            item["book_href"] = book.xpath(".//p[@class='sell-point']/a/@href").extract_first()
            # Seller (bookstore) name
            item["book_store"] = book.xpath(".//p[@class='seller oh no-more ']/a/text()").extract_first()

            # Request the detail page (response.follow completes the relative URL)
            yield response.follow(
                item["book_href"],
                callback=self.parse_book_detail,
                meta={"item": deepcopy(item)}
            )
        # Paginate the list pages
        next_page_url_1 = "https://list.suning.com/emall/showProductList.do?ci={}&pg=03&cp={}&il=0&iy=0&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAAB&id=IDENTIFYING&cc=010"
        next_page_url_2 = "https://list.suning.com/emall/showProductList.do?ci={}&pg=03&cp={}&il=0&iy=0&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAAB&id=IDENTIFYING&cc=010&paging=1&sub=0"
        # Extract the ci parameter from the sub-category URL
        ci = item["s_href"].split("-")[1]
        # Current page number, embedded in the page's JavaScript
        current_page = re.findall(r'param.currentPage = "(.*?)";', response.body.decode())[0]
        # Total number of pages
        total_page = re.findall(r'param.pageNumbers = "(.*?)";', response.body.decode())[0]
        # print(total_page, "*" * 30)
        # Use if, not while: each list-page response schedules only the next page;
        # a while loop here would yield the same requests forever
        if int(current_page) < int(total_page):
            next_page_num = int(current_page) + 1
            # Build the URL for the first half of the next page
            next_url_1 = next_page_url_1.format(ci, next_page_num)
            yield scrapy.Request(
                next_url_1,
                callback=self.parse_book_list,
                meta={"item": item}
            )
            # Build the URL for the second half of the next page
            next_url_2 = next_page_url_2.format(ci, next_page_num)
            yield scrapy.Request(
                next_url_2,
                callback=self.parse_book_list,
                meta={"item": item}
            )

    def parse_book_detail(self, response):
        """处理图书详情页内容"""
        item = response.meta["item"]
        price_url_temp = "https://pas.suning.com/nspcsale_0_000000000{}_000000000{}_{}_10_010_0100101_226503_1000000_9017_10106_Z001___{}_{}___.html"
        p1 = response.url.split("/")[-1].split(".")[-2]
        p3 = response.url.split("/")[-2]
        p4 = re.findall(r'"catenIds":"(.*?)"', response.body.decode())
        if p4:
            p4 = p4[0]
            p5 = re.findall(r'"weight":"(.*?)"', response.body.decode())[0]
            price_url = price_url_temp.format(p1, p1, p3, p4, p5)
            yield scrapy.Request(
                price_url,
                callback=self.parse_book_price,
                meta={"item": item}
            )

    def parse_book_price(self, response):
        """Extract the book price from the price-API response."""
        item = response.meta["item"]
        price = re.findall(r'"netPrice":"(.*?)"', response.body.decode())
        if price:
            item["book_price"] = price[0]
            print(item)
            yield item
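
The spider only yields plain dicts and prints them; persistence is left to the feed export shown above or to an item pipeline. A minimal sketch of such a pipeline, assuming it lives in the project's pipelines.py and is enabled through ITEM_PIPELINES in settings.py (the file name, class name, and output path are illustrative):

# -*- coding: utf-8 -*-
import json


class SuningBookPipeline(object):
    """Write every crawled item to a JSON Lines file."""

    def open_spider(self, spider):
        # one output file per crawl
        self.file = open("books.jl", "w", encoding="utf-8")

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    def close_spider(self, spider):
        self.file.close()

To enable it, settings.py would contain something like ITEM_PIPELINES = {"suning_book.pipelines.SuningBookPipeline": 300}, where the dotted path follows the illustrative project name above.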

 
