JD Books Spider
# -*- coding: utf-8 -*-
import scrapy
from copy import deepcopy
import json


class JjdSpider(scrapy.Spider):
    name = 'jjd'
    allowed_domains = ['jd.com', 'p.3.cn']
    start_urls = ['https://book.jd.com/booksort.html']
    def parse(self, response):
        # extract every top-level category and its sub-categories
        # groups of top-level categories
        div_list = response.xpath("//div[@class='mc']/dl/dt")
        for dt in div_list:
            item = {}
            item["b_cate"] = dt.xpath("./a/text()").extract_first()
            # groups of sub-categories under the current top-level category
            em_list = dt.xpath("./following-sibling::*[1]/em")
            for em in em_list:
                # URL of the sub-category
                item["s_href"] = "https:" + em.xpath("./a/@href").extract_first()
                item["s_cate"] = em.xpath("./a/text()").extract_first()
                # request the sub-category URL to reach the book list page
                yield scrapy.Request(
                    item["s_href"],
                    callback=self.parse_book_list,
                    meta={"item": deepcopy(item)}
                )
    def parse_book_list(self, response):
        item = response.meta["item"]
        # groups of books on the list page
        li_list = response.xpath("//div[@id='plist']/ul/li")
        for li in li_list:
            item["book_name"] = li.xpath(".//div[@class='p-name']/a/em/text()").extract_first().strip()
            item["book_author"] = li.xpath(".//span[@class='p-bi-name']/span/a/text()").extract_first()
            item["book_press"] = li.xpath(".//span[@class='p-bi-store']/a/text()").extract_first()
            item["book_pub_data"] = li.xpath(".//span[@class='p-bi-date']/text()").extract_first().strip()
            item["book_sku"] = li.xpath("./div/@data-sku").extract_first()
            # item["book_price"] = li.xpath(".//div[@class='p-price']/strong/i/text()").extract_first()
            # URL template of the price interface
            price_url = "https://p.3.cn/prices/mgets?&ext=11101000&pin=&type=1&area=1_72_4137_0&skuIds=J_{}"
            # fill in the sku to get the complete price URL
            price_url_temp = price_url.format(item["book_sku"])
            # send a request to fetch the price
            yield scrapy.Request(
                price_url_temp,
                callback=self.parse_book_price,
                meta={"item": deepcopy(item)}
            )
        # pagination: follow the "next page" link if there is one
        next_url = response.xpath(".//a[@class='pn-next']/@href").extract_first()
        if next_url is not None:
            yield response.follow(
                next_url,
                callback=self.parse_book_list,
                meta={"item": item}
            )
    def parse_book_price(self, response):
        item = response.meta["item"]
        # sample response: [{"op":"62.80","m":"93.00","id":"J_11757834","p":"62.80"}]
        item["book_price"] = json.loads(response.body.decode())[0]["op"]
        yield item
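
The spider yields plain dicts rather than declared Item classes, so one straightforward way to persist the results is an item pipeline. The sketch below is illustrative and uses names that are not part of the original project (JdBookPipeline, jd_books.jsonl); it simply writes one JSON line per scraped book.

# pipelines.py -- illustrative sketch, not part of the original spider
import json

class JdBookPipeline:
    def open_spider(self, spider):
        # open the output file once when the spider starts
        self.file = open("jd_books.jsonl", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # write one JSON object per line; ensure_ascii=False keeps Chinese text readable
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    def close_spider(self, spider):
        self.file.close()

Enable it in settings.py with ITEM_PIPELINES = {"<your_project>.pipelines.JdBookPipeline": 300} (the module path depends on the actual project name), or skip the pipeline entirely and run scrapy crawl jjd -o jd_books.jsonl to let Scrapy's built-in feed export handle storage.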