Scrapy爬一下苏宁图书

最新推荐文章于 2022-11-20 16:36:28 发布

dh0805dh

最新推荐文章于 2022-11-20 16:36:28 发布

阅读量165

点赞数

分类专栏： python自学笔记 python自学练习　文章标签： python scrapy

本文链接：https://blog.csdn.net/dh0805dh/article/details/90109261

版权

python自学笔记同时被 2 个专栏收录

26 篇文章 0 订阅

订阅专栏

python自学练习

15 篇文章 0 订阅

订阅专栏

别说了，GKD，滴滴学生卡

不上注释了哦，如果哪里我写的不明白就留言哈。

# -*- coding: utf-8 -*-
import scrapy,re
from copy import deepcopy
# from book.items import BookItem

class SuningSpider(scrapy.Spider):
    """Crawl Suning's book site from the category menu down to single books.

    Callback chain: ``parse`` (category menu on the start page) ->
    ``parse_smallhref`` (category listing pages, following pagination) ->
    ``parse_detail`` (one dict yielded per book).

    Keys of the yielded dict: ``menu_list``, ``small_list``, ``small_href``,
    ``book_name``, ``book_href``, ``book_price``, ``book_author``,
    ``book_public``.
    """

    name = 'suning'
    allowed_domains = ['suning.com']
    start_urls = ['https://book.suning.com']

    # Patterns searched in the raw response text. The dots in "param." are
    # escaped so they only match a literal dot.
    _CURRENT_PAGE_RE = re.compile(r'param\.currentPage = "(\d+)"')
    _PAGE_COUNT_RE = re.compile(r'param\.pageNumbers = "(\d+)"')
    _CATEGORY_ID_RE = re.compile(r'"categoryId": "(.*?)"')
    _ITEM_PRICE_RE = re.compile(r'"itemPrice":"(.*?)"')

    @staticmethod
    def _first_group(pattern, text):
        """Return group 1 of the first match of *pattern* in *text*, or None."""
        match = pattern.search(text)
        return match.group(1) if match else None

    def parse(self, response):
        """Yield one listing request per sub-category link in the menu.

        Each request carries a fresh dict in ``meta["item"]`` holding the
        parent menu title (``menu_list``), the sub-category title
        (``small_list``) and its absolute URL (``small_href``).
        """
        for dl in response.xpath('//div[@class="menu-list"]//dl'):
            menu_title = dl.xpath('./dt/h3/a/text()').extract_first()
            # Walk the links of THIS <dl> only, so every sub-category is
            # visited and stays attached to its own parent menu title.
            for link in dl.xpath('./dd/a'):
                href = link.xpath('./@href').extract_first()
                if not href:
                    # A menu entry without a target cannot be requested.
                    continue
                item = {
                    "menu_list": menu_title,
                    "small_list": link.xpath('./text()').extract_first(),
                    # urljoin leaves absolute URLs untouched and resolves
                    # relative / protocol-relative ones.
                    "small_href": response.urljoin(href),
                }
                yield scrapy.Request(
                    item["small_href"],
                    callback=self.parse_smallhref,
                    meta={"item": item},
                )

    def parse_smallhref(self, response):
        """Yield a detail request per listed book, then the next listing page.

        Reads the category dict from ``response.meta["item"]`` and never
        mutates it; every outgoing request gets its own copy.
        """
        item = response.meta["item"]

        # Iterate the <li> rows; stopping at <ul> would expose only the
        # first book of the page to extract_first().
        for li in response.xpath('//div[@id="filter-results"]/ul/li'):
            href = li.xpath('.//div[@class="res-info"]/p[2]/a/@href').extract_first()
            if not href:
                # Row without a product link (nothing to follow).
                continue
            book = deepcopy(item)
            book["book_name"] = li.xpath('.//div[@class="res-info"]/p[2]/a/text()').extract_first()
            if href.startswith('//'):
                # Protocol-relative link: force https as the original did.
                book["book_href"] = 'https:' + href
            else:
                book["book_href"] = response.urljoin(href)
            yield scrapy.Request(
                book["book_href"],
                callback=self.parse_detail,
                meta={"item": book},
            )

        # Pagination parameters are read from the page text; if any of them
        # is missing there is no way to build the next URL, so stop here
        # instead of raising IndexError.
        current_raw = self._first_group(self._CURRENT_PAGE_RE, response.text)
        total_raw = self._first_group(self._PAGE_COUNT_RE, response.text)
        category_id = self._first_group(self._CATEGORY_ID_RE, response.text)
        if current_raw is None or total_raw is None or not category_id:
            return

        current_page = int(current_raw)
        page_count = int(total_raw)
        # NOTE(review): whether currentPage is 0- or 1-based is not visible
        # here; the original "<" comparison is kept unchanged - confirm.
        if current_page < page_count:
            next_url = 'https://list.suning.com/1-{}-{}.html'.format(category_id, current_page + 1)
            yield scrapy.Request(
                next_url,
                callback=self.parse_smallhref,
                # Category-only dict: no book fields leak into the next page.
                meta={"item": deepcopy(item)},
            )

    def parse_detail(self, response):
        """Add price, author and publisher to the book dict and yield it.

        ``book_price`` is ``None`` (and a warning is logged) when the page
        text contains no ``"itemPrice"`` entry; the item is still yielded.
        """
        item = response.meta["item"]
        item["book_price"] = self._first_group(self._ITEM_PRICE_RE, response.text)
        if item["book_price"] is None:
            self.logger.warning("No itemPrice found on %s", response.url)
        item["book_author"] = response.xpath('//ul[@class="bookcon-param clearfix"]/li[1]/span/text()').extract_first()
        item["book_public"] = response.xpath('//ul[@class="bookcon-param clearfix"]/li[2]/text()').extract_first()
        yield item