Scrapy爬取京东图书信息

最新推荐文章于 2024-05-02 17:41:07 发布

BRUIN.

最新推荐文章于 2024-05-02 17:41:07 发布

阅读量421

点赞数 1

分类专栏： Python爬虫　文章标签： xpath url json

本文链接：https://blog.csdn.net/i_i___lo_ve___ya/article/details/105165069

版权

Python爬虫专栏收录该内容

38 篇文章 2 订阅

订阅专栏

网页结构比较简单，需要注意两点：一是获取价格需要找到对应的价格接口并单独请求；二是通过 meta 传递 item 时需要深拷贝，避免多个请求共用同一个 item 导致数据重复。以下是 spider 文件：

# -*- coding: utf-8 -*-
import scrapy
from copy import deepcopy
import json


class JsbookSpider(scrapy.Spider):
    """Crawl JD.com book listings, one sub-category at a time.

    Flow:
      1. ``parse`` walks the category index and requests every sub-category
         listing page.
      2. ``parse_book_page`` extracts each book on a listing page, requests
         its price from the price endpoint, and follows the "next page" link.
      3. ``get_book_price`` attaches the price and emits the finished item.

    Items are plain dicts with the keys ``big_sort``, ``small_sort``,
    ``small_sort_href``, ``book_name``, ``book_href``, ``book_author`` and
    ``book_price``.
    """

    name = 'jdbook'
    allowed_domains = ['jd.com', 'p.3.cn']
    start_urls = ['https://book.jd.com/booksort.html']

    def parse(self, response):
        """Yield one listing-page request per sub-category on the index page.

        Each request carries a fresh dict in ``meta['item']`` holding the
        top-level category, the sub-category name and the sub-category URL.
        """
        # Each <dt> is a top-level category; the <dd> right after it holds
        # the sub-categories as <em> elements.
        for dt in response.xpath('//div[@id="booksort"]/div[2]/dl/dt'):
            big_sort = dt.xpath('./a/text()').extract_first()

            for em in dt.xpath('./following-sibling::dd[1]/em'):
                href = em.xpath('./a/@href').extract_first()
                if href is None:
                    # No link means nothing to crawl; building a Request with
                    # url=None would raise and abort the remaining categories.
                    continue

                # A new dict per request, so later iterations can never
                # overwrite values a pending request still refers to.
                item = {
                    'big_sort': big_sort,
                    'small_sort': em.xpath('./a/text()').extract_first(),
                    # urljoin resolves protocol-relative ("//host/path"),
                    # relative and already-absolute hrefs alike.
                    'small_sort_href': response.urljoin(href),
                }

                yield scrapy.Request(
                    url=item['small_sort_href'],
                    callback=self.parse_book_page,
                    meta={'item': item},
                )

    def parse_book_page(self, response):
        """Yield a price request per book, then a request for the next page.

        Books without a ``data-sku`` attribute are skipped because their
        price cannot be looked up.
        """
        category_item = response.meta.get('item') or {}

        for li in response.xpath('//ul[@class="gl-warp clearfix"]/li'):
            # Copy per book so concurrent price requests never share state.
            item = deepcopy(category_item)

            # Book title
            book_name = li.xpath(
                './div/div[@class="p-name"]/a/em/text()').extract_first()
            item['book_name'] = (
                book_name.strip() if book_name is not None else None)

            # Book detail page URL
            book_href = li.xpath(
                './div/div[@class="p-name"]/a/@href').extract_first()
            item['book_href'] = (
                response.urljoin(book_href) if book_href is not None else None)

            # Promotional blurb (collection currently disabled):
            # item['promo_words'] = li.xpath('./div/div[@class="p-name"]/a/i/text()').extract_first()

            # Author
            item['book_author'] = li.xpath(
                './div/div[@class="p-bookdetails"]/span/span/a/@title'
            ).extract_first()

            # The price is fetched from a separate endpoint keyed by SKU.
            sku = li.xpath('./div/@data-sku').extract_first()
            if sku is None:
                continue

            yield scrapy.Request(
                url='https://p.3.cn/prices/mgets?&skuIds=J_{}'.format(sku),
                callback=self.get_book_price,
                meta={'item': item},
            )

        # Pagination
        next_url = response.xpath('//a[@class="pn-next"]/@href').extract_first()
        if next_url is not None:
            self.logger.info('获取下一页')

            yield scrapy.Request(
                # Resolve against the current page instead of hard-coding the
                # host and the plain-http scheme.
                url=response.urljoin(next_url),
                callback=self.parse_book_page,
                meta={'item': deepcopy(category_item)},
            )

    def get_book_price(self, response):
        """Attach the price from the price endpoint response and yield the item.

        The code reads ``op`` from the first element of the decoded JSON
        body. If the body cannot be decoded or lacks that field,
        ``book_price`` is set to ``None`` so the rest of the item is kept.
        """
        item = response.meta.get('item') or {}

        try:
            item['book_price'] = json.loads(response.text)[0]['op']
        except (ValueError, LookupError, TypeError):
            # ValueError: invalid JSON; LookupError: empty list / missing
            # key; TypeError: unexpected payload shape.
            self.logger.warning(
                'Could not read price from %s', response.url)
            item['book_price'] = None

        yield item