目标: 爬取苏宁图书下所有书本的信息
- 爬取每个大分类(文学艺术)下的中间分类(小说)下的每个小分类(中国当代小说)的书本信息,并且进行翻页请求
- 大分类名字
- 中间分类名字
- 小分类名字
- 小分类链接
- 图书标题
- 书店名字
- 书的链接
- 书的价格
- 翻页请求
- 将数据保存到mongodb数据库中
代码如下(scrapy):
import re
from collections import OrderedDict
from copy import deepcopy
from pprint import pprint
import scrapy
class MybookSpider(scrapy.Spider):
    """Crawl book data from every category tree on book.suning.com.

    Crawl flow:
        parse            -> top-level (type1) and middle (type2) category links
        parse_detail     -> sub-category (type3) links, or fall through when absent
        noType3_book_info / book_info
                         -> product listing pages + AJAX second-half pages
                            (noType3_bookinfo2 / book_info2) + pagination
        book_detail      -> builds the price-service URL per book
        book_price(/2)   -> extracts the price and yields the finished item

    Each request carries a deepcopy of the partially-filled item dict in
    ``meta`` so concurrent branches never mutate a shared dict.
    """

    name = 'mybook'
    allowed_domains = ['suning.com']
    start_urls = ['https://book.suning.com/']

    def parse(self, response):
        """Extract type1 (e.g. 文学艺术) and type2 (e.g. 小说) categories."""
        type1_lists = response.xpath("/html/body/div[6]/div/div[1]/div[1]/div[1]/div[@class='menu-item']")
        print(len(type1_lists))
        # [:2] keeps the demo crawl small; drop the slice for a full crawl.
        for type1_list in type1_lists[:2]:
            item = {}
            item["type1_book"] = type1_list.xpath('dl/dt/h3/a/text()').extract_first()
            print(item["type1_book"])
            href = type1_list.xpath('dl/dt/h3/a/@href').extract_first()
            type2_lists = type1_list.xpath('dl/dd/a')
            if not type2_lists:
                # No middle tier: the header link itself is the listing page.
                yield scrapy.Request(
                    href,
                    callback=self.parse_detail,
                    meta={"item": deepcopy(item)},
                )
            for type2_list in type2_lists[:2]:
                item['type2_book'] = type2_list.xpath('text()').extract_first()
                item['type2_bookurl'] = type2_list.xpath('@href').extract_first()
                print(item['type2_book'])
                yield scrapy.Request(
                    item['type2_bookurl'],
                    callback=self.parse_detail,
                    meta={"item": deepcopy(item)},
                )

    def parse_detail(self, response):
        """Extract type3 (e.g. 中国当代小说) sub-categories.

        When the page has no type3 filter bar, reuse the type2 data as
        type3 and re-fetch the same URL (dont_filter bypasses the dupe
        filter) so the listing is parsed by noType3_book_info.
        """
        item = response.meta["item"]
        type3_lists = response.xpath('//*[@id="search-opt"]/div/dl[2]/dd/div[1]/div/a')
        if not type3_lists:
            print(response.url)
            item["type3_book"] = item["type2_book"]
            item["type3_bookurl"] = item["type2_bookurl"]
            yield scrapy.Request(
                response.url,
                callback=self.noType3_book_info,
                meta={"item": deepcopy(item)},
                dont_filter=True,
            )
        else:
            for type3_list in type3_lists[:2]:
                item['type3_book'] = type3_list.xpath("text()").extract_first()
                # Sub-category hrefs are protocol-relative; prepend the scheme.
                item['type3_bookurl'] = "https:" + type3_list.xpath("@href").extract_first()
                yield scrapy.Request(
                    item['type3_bookurl'],
                    callback=self.book_info,
                    meta={"item": deepcopy(item)},
                )

    def noType3_book_info(self, response):
        """Parse a listing page for a category without a type3 filter.

        Yields one book_detail request per product, requests the AJAX
        second-half of the page (paging=1..3), and follows pagination up
        to page 5.
        """
        item = response.meta['item']
        book_lists = response.xpath('//*[@id="product-list"]/ul/li')
        if not book_lists:
            # Different page template: retry via the book_info parser.
            yield scrapy.Request(
                response.url,
                callback=self.book_info,
                meta={"item": deepcopy(item)},
                dont_filter=True,
            )
        if book_lists:
            for book_list in book_lists:
                item["book_url"] = "https:" + book_list.xpath('div/div/div[1]/div/a/@href').extract_first()
                item["book_name"] = book_list.xpath('div/div/div[2]/div[2]/a/text()').extract_first()
                item["shopname"] = book_list.xpath('div/div/div[2]/div[4]/a/text()').extract_first()
                item['book_img'] = "https:" + book_list.xpath('div/div/div[1]/div/a/img/@src').extract_first()
                yield scrapy.Request(
                    item["book_url"],
                    callback=self.book_detail,
                    meta={"item": deepcopy(item)},
                )
        # Decode once; the page embeds its paging state in inline JS.
        body = response.body.decode()
        current_page = int(re.findall(r'param.currentPage = "(.*?)";', body)[0])
        # AJAX endpoint serving the lazily-loaded second half of the list.
        book_listh = "https://search.suning.com/emall/searchV1Product.do?keyword=%E6%95%A3%E6%96%87%E9%9A%8F%E7%AC%94&ci=0&pg=01&cp={}&il=1&st=0&iy=0&isNoResult=0&n=1&sesab=ACAABAAB&id=IDENTIFYING&cc=701&paging={}&sub=0&jzq=33003"
        for i in range(1, 4):
            href = book_listh.format(current_page, i)
            yield scrapy.Request(
                href,
                callback=self.noType3_bookinfo2,
                meta={"item": deepcopy(item)},
            )
        page_count = int(re.findall(r'param.pageNumbers = "(.*?)";', body)[0])
        # The category id (ci) is the second-to-last path segment of the URL.
        ci = item["type2_bookurl"].split("/")[-2]
        next_u = "https://search.suning.com//{}/&iy=0&isNoResult=0&cp={}"
        # Follow pagination, capped at page 5 to bound the demo crawl.
        if current_page < page_count - 1 and current_page < 5:
            print(ci)
            current_page += 1
            next_url = next_u.format(ci, current_page)
            yield scrapy.Request(
                next_url,
                callback=self.noType3_book_info,
                meta={"item": deepcopy(item)},
            )

    def book_info(self, response):
        """Parse a listing page for a type3 sub-category (same duties as
        noType3_book_info but with the filter-results page layout)."""
        item = response.meta['item']
        print("book_info")
        book_lists = response.xpath('//*[@id="filter-results"]/ul/li')
        # AJAX endpoint serving the lazily-loaded second half of the list.
        temp_url = "https://list.suning.com/emall/showProductList.do?ci={}&pg=03&cp={}&il=0&iy=0&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAAB&id=IDENTIFYING&cc=701&paging=1&sub=0"
        for book_list in book_lists:
            item["book_url"] = "https:" + book_list.xpath('div/div/div/div[1]/div/a/@href').extract_first()
            item["book_name"] = book_list.xpath('div/div/div/div[2]/p[2]/a/text()').extract_first()
            item["shopname"] = book_list.xpath('div/div/div/div[2]/p[4]/a/text()').extract_first()
            # Lazy-loaded thumbnails keep the real URL in @src2.
            item['book_img'] = "https:" + book_list.xpath('div/div/div/div[1]/div/a/img/@src2').extract_first()
            yield scrapy.Request(
                item["book_url"],
                callback=self.book_detail,
                meta={"item": deepcopy(item)},
            )
        # Decode once; paging state lives in inline JS.
        body = response.body.decode()
        current_page = int(re.findall(r'param.currentPage = "(.*?)";', body)[0])
        # ci is the middle token of the "1-<ci>-<page>.html" URL pattern.
        ci = item["type3_bookurl"].split("-")[1]
        next_booklist = temp_url.format(ci, current_page)
        yield scrapy.Request(
            next_booklist,
            callback=self.book_info2,
            meta={"item": deepcopy(item)},
        )
        page_count = int(re.findall(r'param.pageNumbers = "(.*?)";', body)[0])
        next_u = "https://list.suning.com/1-{}-{}.html#search-path-box"
        # Follow pagination, capped at page 5 to bound the demo crawl.
        if current_page < page_count - 1 and current_page < 5:
            current_page += 1
            next_url = next_u.format(ci, current_page)
            print(next_url)
            yield scrapy.Request(
                next_url,
                callback=self.book_info,
                meta={"item": deepcopy(item)},
            )

    def noType3_bookinfo2(self, response):
        """Parse the AJAX second-half fragment (bare <li> list) for the
        no-type3 layout and request each book's detail page."""
        if response.xpath('/html/body/li'):
            item = response.meta['item']
            book_lists = response.xpath('/html/body/li')
            print("noType3_bookinfo2" + str(len(book_lists)))
            for book_list in book_lists:
                item["book_url"] = "https:" + book_list.xpath('div/div/div[1]/div/a/@href').extract_first()
                item["book_name"] = book_list.xpath('div/div/div[2]/div[2]/a/text()').extract_first()
                item["shopname"] = book_list.xpath('div/div/div[2]/div[4]/a/text()').extract_first()
                item['book_img'] = "https:" + book_list.xpath('div/div/div[1]/div/a/img/@src').extract_first()
                yield scrapy.Request(
                    item["book_url"],
                    callback=self.book_detail,
                    meta={"item": deepcopy(item)},
                )

    def book_info2(self, response):
        """Parse the AJAX second-half fragment for the type3 layout and
        request each book's detail page."""
        if response.xpath('/html/body/li'):
            item = response.meta['item']
            book_lists = response.xpath('/html/body/li')
            for book_list in book_lists:
                item["book_url"] = "https:" + book_list.xpath('div/div/div/div[1]/div/a/@href').extract_first()
                item["book_name"] = book_list.xpath('div/div/div/div[2]/p[2]/a/text()').extract_first()
                item["shopname"] = book_list.xpath('div/div/div/div[2]/p[4]/a/text()').extract_first()
                # Lazy-loaded thumbnails keep the real URL in @src2.
                item['book_img'] = "https:" + book_list.xpath('div/div/div/div[1]/div/a/img/@src2').extract_first()
                yield scrapy.Request(
                    item["book_url"],
                    callback=self.book_detail,
                    meta={"item": deepcopy(item)},
                )

    def book_detail(self, response):
        """Build the price-service URL from ids embedded in the product
        page (URL path segments + inline JS vars) and request it."""
        item = response.meta['item']
        price_temp_url = "https://pas.suning.com/nspcsale_0_0000000{}_000000000{}_{}_140_701_7010101_502282_1000186_9186_11475_Z001___{}_{}___.html?callback=pcData&_=1559546689850"
        p1 = response.url.split("/")[-1].split(".")[0]
        p3 = response.url.split("/")[-2]
        # Decode once; both ids sit in the same inline JS blob.
        body = response.body.decode()
        p4 = re.findall(r'"catenIds":"(.*?)"', body)[0]
        p5 = re.findall(r'"weight":"(.*?)"', body)[0]
        price_temp_url = price_temp_url.format(p1, p1, p3, p4, p5)
        yield scrapy.Request(
            price_temp_url,
            callback=self.book_price,
            meta={"item": deepcopy(item)},
        )

    def book_price(self, response):
        """Extract the price; on miss, retry the alternate URL scheme
        (shorter zero-padding) via book_price2."""
        item = response.meta['item']
        try:
            # IndexError when the payload lacks a netPrice field; anything
            # else (e.g. a decode failure) should surface, not be hidden.
            price = re.findall(r'"netPrice":"(.*?)"', response.body.decode())[0]
        except IndexError:
            price_temp_url2 = re.sub("000000000", "0000000", response.url)
            yield scrapy.Request(
                price_temp_url2,
                callback=self.book_price2,
                meta={"item": deepcopy(item)},
            )
        else:
            item["book_price"] = price
            yield item

    def book_price2(self, response):
        """Extract the price from the fallback endpoint and yield the
        completed item (raises IndexError if still absent)."""
        item = response.meta['item']
        item["book_price"] = re.findall(r'"netPrice":"(.*?)"', response.body.decode())[0]
        yield item
Git 地址:(链接待补充)
如需代码优化或有其他问题,可以联系本人邮箱:zh15270924273@163.com