网页结构比较简单。需要注意两点：一是获取价格需要找到对应的价格接口并单独请求；二是通过 meta 传递 item 时需要深拷贝，避免多个请求共享同一个 item 对象而导致数据互相覆盖、出现重复。以下是 spider 文件：
# -*- coding: utf-8 -*-
import scrapy
from copy import deepcopy
import json
class JsbookSpider(scrapy.Spider):
    """Crawl JD.com book categories, listing pages and per-book prices.

    Request flow: ``parse`` (category index) -> ``parse_book_page``
    (paginated listing of one sub-category) -> ``get_book_price`` (price
    endpoint). The partially filled item dict is handed from one callback
    to the next through ``Request.meta['item']``.
    """

    name = 'jdbook'
    allowed_domains = ['jd.com', 'p.3.cn']
    start_urls = ['https://book.jd.com/booksort.html']

    def parse(self, response):
        """Yield one listing request per sub-category on the index page.

        Each request carries an item with ``big_sort``, ``small_sort`` and
        ``small_sort_href``. Sub-categories without a link are skipped,
        because a request cannot be built without a URL.
        """
        for dt in response.xpath('//div[@id="booksort"]/div[2]/dl/dt'):
            # Top-level category name.
            big_sort = dt.xpath('./a/text()').extract_first()
            # Sub-categories live in the <dd> that directly follows the <dt>.
            for em in dt.xpath('./following-sibling::dd[1]/em'):
                href = em.xpath('./a/@href').extract_first()
                if href is None:
                    continue
                # A fresh dict per request, so concurrent callbacks never
                # share (and overwrite) the same item object.
                item = {
                    'big_sort': big_sort,
                    'small_sort': em.xpath('./a/text()').extract_first(),
                    # urljoin resolves protocol-relative and relative hrefs
                    # against the page URL.
                    'small_sort_href': response.urljoin(href),
                }
                yield scrapy.Request(
                    url=item['small_sort_href'],
                    callback=self.parse_book_page,
                    meta={'item': item},
                )

    def parse_book_page(self, response):
        """Yield a price request per book, then a request for the next page.

        Every book gets its own copy of the category item, extended with
        ``book_name``, ``book_href`` and ``book_author``. Books without a
        ``data-sku`` attribute are skipped, since the price URL is built
        from it.
        """
        # Category-level data only; it is never mutated, so the next-page
        # request does not inherit fields from the last book on this page.
        category = response.meta.get('item') or {}

        for li in response.xpath('//ul[@class="gl-warp clearfix"]/li'):
            item = deepcopy(category)

            # Book title.
            book_name = li.xpath(
                './div/div[@class="p-name"]/a/em/text()').extract_first()
            item['book_name'] = (
                book_name.strip() if book_name is not None else None)

            # Detail page URL.
            book_href = li.xpath(
                './div/div[@class="p-name"]/a/@href').extract_first()
            item['book_href'] = (
                response.urljoin(book_href) if book_href is not None else None)

            # The promo text at ./div/div[@class="p-name"]/a/i/text() is
            # currently not collected.

            # Author.
            item['book_author'] = li.xpath(
                './div/div[@class="p-bookdetails"]/span/span/a/@title'
            ).extract_first()

            # The price is fetched from a separate endpoint keyed by SKU id.
            sku = li.xpath('./div/@data-sku').extract_first()
            if sku is None:
                continue
            yield scrapy.Request(
                url='https://p.3.cn/prices/mgets?&skuIds=J_{}'.format(sku),
                callback=self.get_book_price,
                meta={'item': item},
            )

        # Pagination: follow the "next page" link when there is one.
        next_href = response.xpath('//a[@class="pn-next"]/@href').extract_first()
        if next_href is not None:
            self.logger.info('获取下一页')
            yield scrapy.Request(
                url=response.urljoin(next_href),
                callback=self.parse_book_page,
                meta={'item': deepcopy(category)},
            )

    def get_book_price(self, response):
        """Attach ``book_price`` to the item from meta and yield it.

        The response body is expected to be a JSON list whose first element
        holds the price under the ``'op'`` key. If the payload is empty or
        malformed, a warning is logged and ``book_price`` is set to ``None``
        so the rest of the book data is still emitted.
        """
        item = response.meta.get('item')
        try:
            price = json.loads(response.text)[0]['op']
        except (ValueError, LookupError, TypeError):
            self.logger.warning(
                'Could not read price from %s', response.url)
            price = None
        item['book_price'] = price
        yield item