Crawling all Suning books with Scrapy and saving them to a database

Crawling all Suning books with Scrapy

The scraped data is stored in MongoDB. The spider crawls book information across Suning's entire book site: the book name, the book's detailed category, the store selling it, and the price.
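Each record that ends up in MongoDB is a flat dict built up field by field in the spider below; it has roughly this shape (values elided):

# shape of one scraped item -- field names match what suning.py sets below
item = {
    'b_cate': '...',            # large category
    'm_cate': '...',            # middle category
    's_cate': '...',            # small category
    's_href': '...',            # url of the small-category list page
    'book_name': '...',         # book title
    'book_href': '...',         # book detail page url
    'book_store_name': '...',   # store selling the book
    'book_price': '...',        # price fetched from the price interface
}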

Notes

  1. The spider's category hierarchy has to mirror the category tree on the Suning book pages (large / middle / small categories).

  2. Each book list page displays 60 books, but response.body only contains the first 30; the remaining 30 have to be fetched with a separately constructed URL.

  3. In practice, the "雅思IELTS" category uses a URL format different from the other list pages, so its request URL has to be built separately (via the search interface).

  4. Set the robots setting to False in settings.py (ROBOTSTXT_OBEY = False); see the settings sketch after this list.

  5. Prices are not on the list page; they are fetched by constructing a price URL from data on the book detail page.

  6. When yielding scrapy.Request inside a for loop, pass a deepcopy of item in meta; otherwise later iterations overwrite the data of requests that have not been processed yet.
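A minimal settings.py sketch covering note 4 and the MongoDB storage; the 'book' module path and the MONGO_* keys are assumptions, only ROBOTSTXT_OBEY = False is strictly required:

settings.py

# Note 4: ignore robots.txt, otherwise Suning's pages are filtered out
ROBOTSTXT_OBEY = False

# Hypothetical registration of the MongoDB pipeline sketched after suning.py;
# 'book' is an assumed project/module name
ITEM_PIPELINES = {
    'book.pipelines.SuningBookPipeline': 300,
}

# Assumed MongoDB connection settings, read by the pipeline
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DB = 'suning'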

suning.py

import scrapy
import re
from copy import deepcopy

class SuningSpider(scrapy.Spider):
    name = 'suning'
    allowed_domains = ['suning.com']
    start_urls = ['http://book.suning.com/']

    def parse(self, response):
        # top-level (large) categories and their matching sub-menus
        div_list = response.xpath("//div[@class='menu-list']/div[@class='menu-item']")
        div_sub_list = response.xpath("//div[@class='menu-list']/div[@class='menu-sub']")
        for index, div in enumerate(div_list):
            item = {}
            # name of the large category
            item['b_cate'] = div.xpath(".//h3/a/text()").get()
            # the menu-sub block that belongs to this menu-item
            current_sub_div = div_sub_list[index]
            # groups of middle categories
            p_list = current_sub_div.xpath(".//div[@class='submenu-left']/p[@class='submenu-item']")
            for p in p_list:
                # name of the middle category
                item["m_cate"] = p.xpath("./a/text()").get()
                # groups of small categories
                li_list = p.xpath("./following-sibling::ul[1]/li")
                for li in li_list:
                    # name of the small category
                    item['s_cate'] = li.xpath("./a/text()").get()
                    # url of the small category
                    item['s_href'] = li.xpath("./a/@href").get()
                    # request the book list page (first half of the 60 items)
                    yield scrapy.Request(
                        item['s_href'],
                        callback=self.parse_book_list,
                        meta={"item": deepcopy(item)}
                    )
                    # request the second half of the list page content (note 2)
                    next_part_url_temp = "https://list.suning.com/emall/showProductList.do?ci={}&pg=03&cp=0&il=0&iy=0&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAABC&id=IDENTIFYING&cc=394&paging=1&sub=0"
                    ci = item['s_href'].split('-')
                    if len(ci) > 1:
                        ci = item['s_href'].split('-')[1]
                        next_part_url = next_part_url_temp.format(ci)
                    else:
                        # special case (e.g. the IELTS category, note 3): fall back to the search interface
                        name = item['s_cate']
                        next_part_url_temp_d = "https://search.suning.com/emall/searchProductList.do?keyword={}&ci=0&pg=01&cp=3&il=0&st=0&iy=0&adNumber=0&n=1&ch=4&sesab=ACAABAABCCAA&id=IDENTIFYING&cc=394&paging=1&sub=0"
                        next_part_url = next_part_url_temp_d.format(name)

                    yield scrapy.Request(
                        next_part_url,
                        callback=self.parse_book_list,
                        meta={"item": deepcopy(item)}
                    )


    def parse_book_list(self, response):
        item = response.meta["item"]
        li_list = response.xpath("//li[contains(@class,'product')]")
        for li in li_list:
            # book title (guard against list entries without a sell-point link)
            book_name = li.xpath(".//p[@class='sell-point']/a/text()").get()
            item['book_name'] = book_name.strip() if book_name else None
            # link to the book detail page
            item['book_href'] = li.xpath(".//p[@class='sell-point']/a/@href").get()
            # name of the store selling the book
            item['book_store_name'] = li.xpath(".//p[contains(@class,'seller oh no-more')]/a/text()").get()
            yield response.follow(
                item['book_href'],
                callback=self.parse_book_detail,
                meta={'item': deepcopy(item)}
            )
        next_url_1 = "https://list.suning.com/emall/showProductList.do?ci={}&pg=03&cp={}&il=0&iy=0&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAABC&id=IDENTIFYING&cc=394"
        next_url_2 = "https://list.suning.com/emall/showProductList.do?ci={}&pg=03&cp={}&il=0&iy=0&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAABC&id=IDENTIFYING&cc=394&paging=1&sub=0"
        next_url_1_d = "https://search.suning.com/emall/searchProductList.do?keyword={}&ci={}&pg=01&cp=1&il=0&st=0&iy=0&adNumber=0&n=1&ch=4&sesab=ACAABAABCCAA&id=IDENTIFYING&cc=394"
        next_url_2_d = "https://search.suning.com/emall/searchProductList.do?keyword={}&ci={}&pg=01&cp=1&il=0&st=0&iy=0&adNumber=0&n=1&ch=4&sesab=ACAABAABCCAA&id=IDENTIFYING&cc=394"

        current_Page = re.findall('param.currentPage = "(.*?)";', response.body.decode())[0]
        total_Page = re.findall('param.pageNumbers = "(.*?)";', response.body.decode())[0]
        if int(current_Page) < int(total_Page):
            next_page_num = int(current_Page) + 1
            ci = item['s_href'].split('-')
            if len(ci) > 1:
                ci = item['s_href'].split('-')[1]
                next_url_1 = next_url_1.format(ci, next_page_num)
                next_url_2 = next_url_2.format(ci, next_page_num)
            else:
                name = item['s_cate']
                next_url_1 = next_url_1_d.format(name, next_page_num)
                next_url_2 = next_url_2_d.format(name, next_page_num)
            yield scrapy.Request(
                next_url_1,
                callback=self.parse_book_list,
                meta={"item": item}
            )
            # print("*"*100)
            next_url_2 = next_url_2.format(ci, next_page_num)
            yield scrapy.Request(
                next_url_2,
                callback=self.parse_book_list,
                meta={"item": item}
            )
            # print("?"*100)


    def parse_book_detail(self, response):
        item = response.meta['item']
        # the price is not in the detail page HTML; it comes from a separate price interface (note 5)
        price_temp_url = "https://pas.suning.com/nspcsale_0_0000000{}_0000000{}_{}_180_394_3940199_502282_1000118_9118_10920_Z001___{}_{}________0___0.0_2__502320_502687_.html"
        # product id and shop id are taken from the detail page url
        p1 = response.url.split('/')[-1].split('.')[0]
        p3 = response.url.split('/')[-2]
        # category chain and weight are embedded in the page's JS
        p4 = re.findall('"catenIds":"(.*?)"', response.body.decode())
        p5 = re.findall('"weight":"(.*?)"', response.body.decode())
        if p4 and p5:
            price_url = price_temp_url.format(p1, p1, p3, p4[0], p5[0])
            yield scrapy.Request(
                price_url,
                callback=self.parse_book_price,
                meta={'item': item}
            )

    def parse_book_price(self, response):
        item = response.meta['item']
        # the price interface returns JS; pull netPrice out with a regex
        price = re.findall('"netPrice":"(.*?)"', response.body.decode())
        item['book_price'] = price[0] if price else None
        yield item
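
The spider only yields plain dicts; storing them in MongoDB (as mentioned at the top) happens in an item pipeline. A minimal sketch, assuming pymongo is installed and using the hypothetical class and setting names from the settings sketch above:

pipelines.py

from pymongo import MongoClient


class SuningBookPipeline:
    """Minimal MongoDB pipeline sketch; class and setting names are assumptions."""

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # MONGO_URI / MONGO_DB come from the assumed settings shown earlier
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI', 'mongodb://localhost:27017'),
            mongo_db=crawler.settings.get('MONGO_DB', 'suning'),
        )

    def open_spider(self, spider):
        self.client = MongoClient(self.mongo_uri)
        self.collection = self.client[self.mongo_db]['book']

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # the spider yields plain dicts, so they can be inserted directly
        self.collection.insert_one(dict(item))
        return item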



