scrapy爬取京东所有图书

最新推荐文章于 2024-05-02 17:41:07 发布

孔丘闻言

最新推荐文章于 2024-05-02 17:41:07 发布

阅读量1.3k

点赞数 1

分类专栏：爬虫 python 文章标签： python scrapy book

本文链接：https://blog.csdn.net/xiaodsadwwq/article/details/93797399

版权

python 同时被 2 个专栏收录

24 篇文章 0 订阅

订阅专栏

爬虫

12 篇文章 0 订阅

订阅专栏

京东图书 https://book.jd.com/booksort.html

#### 目标：爬取京东图书下面所有图书的详细信息

爬取所有大分类下的所有小分类下的图书信息
大分类的名字
小分类的名字
小分类的链接
书的名字
书店的名字
书的链接
书对应图片链接
书的价格
进行翻页请求

代码如下(scrapy)：

# -*- coding: utf-8 -*-
import scrapy
from jdBook.items import JdbookItem
import time
import json
import re


class JdbookerSpider(scrapy.Spider):
    """Crawl JD.com book listings, one sub-category at a time.

    Callback chain:

    1. ``parse``    - category index page; one request per sub-category.
    2. ``product``  - listing page; one price request per book, plus a
       request for a further listing page.
    3. ``getprice`` - price endpoint; reads the price and requests the
       book's detail page.
    4. ``getshop``  - detail page; reads the shop name and yields the
       finished ``JdbookItem``.

    Item fields travel between callbacks as flat keys in ``Request.meta``.
    """

    name = 'jdbooker'
    start_urls = ['http://book.jd.com/booksort.html']

    # Throttle through Scrapy's downloader instead of time.sleep(): sleeping
    # inside a callback blocks the Twisted reactor and stalls every download.
    custom_settings = {'DOWNLOAD_DELAY': 0.5}

    list_p = []  # not used by this spider; kept so external references work
    # Checked in product(); nothing in this class increments it.
    page_num = 0

    # How many top-level categories parse() follows; None follows them all.
    # May also be supplied as a spider argument (-a category_limit=5).
    category_limit = 2

    # Meta keys describing the category a listing page belongs to.
    _CATEGORY_KEYS = ('big_sort', 'small_sort', 'small_url')
    # Meta keys known once a single book has been picked from a listing.
    _BOOK_KEYS = _CATEGORY_KEYS + ('book', 'book_img', 'book_url')

    # Cover-image URLs are taken from the raw page text, not from the
    # per-book nodes.
    _IMG_RE = re.compile(r'//img\d\d\.360buyimg.com/n7/.{,80}\.jpg')
    # Listing pages exposing fewer image URLs than this are not harvested
    # (presumably a fully rendered page carries 60 books - TODO confirm).
    _MIN_IMAGES = 60
    # Position, among the pager links, of the link followed to the next page.
    _NEXT_LINK_INDEX = 6

    def parse(self, response):
        """Yield one listing request for every sub-category link.

        Only the first ``category_limit`` top-level categories are followed.
        """
        big_sorts = response.xpath("//div[@class='mc']//dt//text()").getall()
        limit = self.category_limit
        if limit is not None:
            # Slicing (rather than indexing a fixed range) cannot raise when
            # the page lists fewer categories than the limit.
            big_sorts = big_sorts[:int(limit)]

        for position, big in enumerate(big_sorts, start=1):
            small = response.xpath("//dd[%d]//em//text()" % position).getall()
            link = response.xpath("//dd[%d]//em//a//@href" % position).getall()
            self.logger.debug("%s: %d sub-category links", big, len(link))
            # zip() stops at the shorter list, so a name/link count mismatch
            # cannot raise IndexError.
            for small_sort, small_url in zip(small, link):
                yield scrapy.Request(
                    url="https:%s" % small_url,
                    meta={
                        'big_sort': big,
                        'small_sort': small_sort,
                        'small_url': small_url,
                    },
                    callback=self.product,
                    priority=10,
                )

    def product(self, response):
        """Yield a price request per listed book and follow pagination.

        Expects ``big_sort``, ``small_sort`` and ``small_url`` in
        ``response.meta``.
        """
        # Copy only the keys we own; response.meta also holds Scrapy's own
        # bookkeeping, which must not leak into requests for other hosts.
        category = {key: response.meta[key] for key in self._CATEGORY_KEYS}

        nodes = response.xpath("//li[@class='gl-item']")
        images = self._IMG_RE.findall(response.text)
        if len(images) >= self._MIN_IMAGES:
            # Images are matched to books purely by position.
            for node, book_img in zip(nodes, images):
                title = node.xpath(".//div[@class='p-name']//em//text()").get()
                book_url = node.xpath(".//div[@class='p-img']//a//@href").get()
                sku = node.xpath(".//div//@data-sku").get()
                if not sku or not book_url:
                    # Without a sku or a detail link no usable request can be
                    # built for this entry.
                    continue
                yield scrapy.Request(
                    url="https://p.3.cn/prices/mgets?skuIds=%s" % sku,
                    meta=dict(
                        category,
                        book=(title or '').strip(),
                        book_img=book_img,
                        book_url=book_url,
                    ),
                    callback=self.getprice,
                    priority=5,
                )

        pager_links = response.xpath("//span[@class='p-num']//a//@href").getall()
        if JdbookerSpider.page_num >= 4:
            JdbookerSpider.page_num = 0
        elif len(pager_links) > self._NEXT_LINK_INDEX:
            url = pager_links[self._NEXT_LINK_INDEX]
            yield scrapy.Request(
                "https://list.jd.com/%s" % url,
                meta=dict(category, url=url),
                callback=self.product,
            )

    def getprice(self, response):
        """Read the book's price and request its detail page.

        The price is the ``p`` field of the first element of the JSON
        response. If the payload does not have that shape, the price is
        recorded as ``None`` so the book is still emitted.
        """
        try:
            price = json.loads(response.text)[0]['p']
        except (ValueError, LookupError, TypeError):
            self.logger.warning("No price in response from %s", response.url)
            price = None

        meta = {key: response.meta[key] for key in self._BOOK_KEYS}
        meta['price'] = price
        yield scrapy.Request(
            url="https:%s" % meta['book_url'],
            meta=meta,
            callback=self.getshop,
        )

    def getshop(self, response):
        """Yield the finished ``JdbookItem``, adding the shop name."""
        book = JdbookItem()
        for key in self._BOOK_KEYS:
            book[key] = response.meta[key]
        # .get() returns None when the page has no matching shop link.
        book['store'] = response.xpath("//div[@class='name']//a/text()").get()
        book['price'] = response.meta['price']
        yield book