京东图书 https://book.jd.com/booksort.html
#### 目标: 爬取京东图书下所有图书的详细信息
- 爬取所有大分类下的所有小分类下的图书信息
- 大分类的名字
- 小分类的名字
- 小分类的链接
- 书的名字
- 书店的名字
- 书的链接
- 书对应图片链接
- 书的价格
- 进行翻页请求
代码如下(scrapy):
# -*- coding: utf-8 -*-
import scrapy
from jdBook.items import JdbookItem
import time
import json
import re
class JdbookerSpider(scrapy.Spider):
    """Crawl JD.com book categories and yield one ``JdbookItem`` per book.

    Request chain:

    1. ``parse``    - category index page; schedules one request per
       sub-category.
    2. ``product``  - listing page; schedules a price request per book and
       follows pagination.
    3. ``getprice`` - price API response; schedules the book detail page.
    4. ``getshop``  - book detail page; emits the finished item.

    Data collected for a book travels between callbacks in
    ``request.meta['book_info']``.
    """

    name = 'jdbooker'
    start_urls = ['http://book.jd.com/booksort.html']
    # Not used by any method of this class; kept so external references
    # to these attributes keep working.
    list_p = []
    page_num = 0

    # Throttle through the downloader rather than calling time.sleep() in a
    # callback: sleeping there blocks the Twisted reactor and stalls every
    # in-flight request, not just the one being scheduled.
    custom_settings = {'DOWNLOAD_DELAY': 0.5}

    # How many top-level categories to crawl; set to None to crawl them all.
    max_big_sorts = 2
    # A listing page is only processed when at least this many cover image
    # URLs are found in its body.
    min_cover_images = 60
    # Index, within the pagination links, of the link that is followed.
    # NOTE(review): presumably the "next page" link - confirm against the
    # live page markup.
    next_page_index = 6

    # Cover image URLs are pulled from the raw page text with this pattern.
    cover_img_re = re.compile(r'//img\d\d\.360buyimg.com/n7/.{,80}\.jpg')

    @staticmethod
    def _https_url(href):
        """Prefix a protocol-relative ``//host/path`` link with ``https:``.

        Any other link is returned unchanged.
        """
        if href.startswith('//'):
            return "https:%s" % href
        return href

    def parse(self, response):
        """Schedule one listing request per sub-category.

        Only the first ``max_big_sorts`` top-level categories are visited.
        Each request carries ``big_sort``, ``small_sort`` and ``small_url``
        in its meta and is handled by ``product``.
        """
        big_sorts = response.xpath("//div[@class='mc']//dt//text()").getall()
        if self.max_big_sorts is not None:
            # Slicing tolerates a page with fewer categories than the limit.
            big_sorts = big_sorts[:self.max_big_sorts]

        # XPath positions are 1-based: the n-th <dt> pairs with the n-th <dd>.
        for position, big in enumerate(big_sorts, 1):
            small = response.xpath("//dd[%d]//em//text()" % position).getall()
            link = response.xpath("//dd[%d]//em//a//@href" % position).getall()
            self.logger.debug("%d sub-category links under %r", len(link), big)
            # zip() stops at the shorter list, so a name/link count mismatch
            # cannot raise IndexError.
            for small_sort, small_url in zip(small, link):
                yield scrapy.Request(
                    url=response.urljoin(self._https_url(small_url)),
                    meta={
                        'big_sort': big,
                        'small_sort': small_sort,
                        'small_url': small_url,
                    },
                    callback=self.product,
                    priority=10,
                )

    def product(self, response):
        """Parse one listing page and follow its pagination.

        For every book entry that has a title, a detail link and a SKU, a
        request to the price API is scheduled (handled by ``getprice``).
        Entries missing any of those are skipped.
        """
        big_sort = response.meta['big_sort']
        small_sort = response.meta['small_sort']
        small_url = response.meta['small_url']

        entries = response.xpath("//li[@class='gl-item']")
        covers = self.cover_img_re.findall(response.text)
        if len(covers) >= self.min_cover_images:
            # Entries and cover URLs are matched by position; zip() keeps
            # that pairing and stops at the shorter sequence.
            for entry, cover in zip(entries, covers):
                title = entry.xpath(".//div[@class='p-name']//em//text()").get()
                book_url = entry.xpath(".//div[@class='p-img']//a//@href").get()
                sku = entry.xpath(".//div//@data-sku").get()
                if not (title and book_url and sku):
                    continue
                info = {
                    'big_sort': big_sort,
                    'small_sort': small_sort,
                    'small_url': small_url,
                    'book': title.strip(),
                    'book_url': book_url,
                    'book_img': cover,
                }
                yield scrapy.Request(
                    url="https://p.3.cn/prices/mgets?skuIds=%s" % sku,
                    meta={'book_info': info},
                    callback=self.getprice,
                    priority=5,
                )

        # NOTE(review): the original paste lost its indentation, so whether
        # pagination was nested under the image-count check above cannot be
        # recovered; it is run for every listing page here.
        if JdbookerSpider.page_num >= 4:
            # Nothing in this class increments page_num, so this branch only
            # fires if the counter is changed from outside.
            JdbookerSpider.page_num = 0
            return

        pages = response.xpath("//span[@class='p-num']//a//@href").getall()
        if len(pages) > self.next_page_index:
            url = pages[self.next_page_index]
            yield scrapy.Request(
                response.urljoin(url),
                meta={
                    'big_sort': big_sort,
                    'small_sort': small_sort,
                    'small_url': small_url,
                    'url': url,
                },
                callback=self.product,
            )

    def getprice(self, response):
        """Read the price from the price API and request the detail page.

        The price is taken from field ``p`` of the first element of the
        JSON array. If the payload does not have that shape the book is
        dropped with a warning.
        """
        try:
            price = json.loads(response.text)[0]['p']
        except (ValueError, LookupError, TypeError):
            self.logger.warning("Unexpected price payload from %s", response.url)
            return

        info = dict(response.meta['book_info'], price=price)
        yield scrapy.Request(
            url=response.urljoin(self._https_url(info['book_url'])),
            meta={'book_info': info},
            callback=self.getshop,
        )

    def getshop(self, response):
        """Read the shop name from the detail page and emit the item.

        ``store`` is None when the shop link is not found on the page.
        """
        info = response.meta['book_info']
        book = JdbookItem()
        for field in ('big_sort', 'small_sort', 'small_url',
                      'book', 'book_url', 'book_img'):
            book[field] = info[field]
        book['store'] = response.xpath("//div[@class='name']//a/text()").get()
        book['price'] = info['price']
        yield book