目标: 爬取苏宁图书下所有书本的信息
- 爬取每个大分类(文学艺术)下的中间分类(小说)下的每个小分类(中国当代小说)的书本信息,并且进行翻页请求
- 大分类名字
- 中间分类名字
- 小分类名字
- 小分类链接
- 图书标题
- 书店名字
- 书的链接
- 书的价格
- 翻页请求
- 将数据保存到mongodb数据库中
代码如下(scrapy):
import re
from collections import OrderedDict
from copy import deepcopy
from pprint import pprint
import scrapy
class MybookSpider(scrapy.Spider):
    """Crawl book data from every category tree on book.suning.com.

    Crawl flow:
        parse            -> top-level (type1) and middle (type2) category links
        parse_detail     -> sub-category (type3) links, or fall through when absent
        noType3_book_info / book_info
                         -> product listing pages + AJAX second-half pages
                            (noType3_bookinfo2 / book_info2) + pagination
        book_detail      -> builds the price-service URL per book
        book_price(/2)   -> extracts the price and yields the finished item

    Each request carries a deepcopy of the partially-filled item dict in
    ``meta`` so concurrent branches never mutate a shared dict.
    """

    name = 'mybook'
    allowed_domains = ['suning.com']
    start_urls = ['https://book.suning.com/']

    def parse(self, response):
        """Extract type1 (e.g. 文学艺术) and type2 (e.g. 小说) categories."""
        type1_lists = response.xpath("/html/body/div[6]/div/div[1]/div[1]/div[1]/div[@class='menu-item']")
        print(len(type1_lists))
        # [:2] keeps the demo crawl small; drop the slice for a full crawl.
        for type1_list in type1_lists[:2]:
            item = {}
            item["type1_book"] = type1_list.xpath('dl/dt/h3/a/text()').extract_first()
            print(item["type1_book"])
            href = type1_list.xpath('dl/dt/h3/a/@href').extract_first()
            type2_lists = type1_list.xpath('dl/dd/a')
            if not type2_lists:
                # No middle tier: the header link itself is the listing page.
                yield scrapy.Request(
                    href,
                    callback=self.parse_detail,
                    meta={"item": deepcopy(item)},
                )
            for type2_list in type2_lists[:2]:
                item['type2_book'] = type2_list.xpath('text()').extract_first()
                item['type2_bookurl'] = type2_list.xpath('@href').extract_first()
                print(item['type2_book'])
                yield scrapy.Request(
                    item['type2_bookurl'],
                    callback=self.parse_detail,
                    meta={"item": deepcopy(item)},
                )

    def parse_detail(self, response):
        """Extract type3 (e.g. 中国当代小说) sub-categories.

        When the page has no type3 filter bar, reuse the type2 data as
        type3 and re-fetch the same URL (dont_filter bypasses the dupe
        filter) so the listing is parsed by noType3_book_info.
        """
        item = response.meta["item"]
        type3_lists = response.xpath('//*[@id="search-opt"]/div/dl[2]/dd/div[1]/div/a')
        if not type3_lists:
            print(response.url)
            item["type3_book"] = item["type2_book"]
            item["type3_bookurl"] = item["type2_bookurl"]
            yield scrapy.Request(
                response.url,
                callback=self.noType3_book_info,
                meta={"item": deepcopy(item)},
                dont_filter=True,
            )
        else:
            for type3_list in type3_lists[:2]:
                item['type3_book'] = type3_list.xpath("text()").extract_first()
                # Sub-category hrefs are protocol-relative; prepend the scheme.
                item['type3_bookurl'] = "https:" + type3_list.xpath("@href").extract_first()
                yield scrapy.Request(
                    item['type3_bookurl'],
                    callback=self.book_info,
                    meta={"item": deepcopy(item)},
                )

    def noType3_book_info(self, response):
        """Parse a listing page for a category without a type3 filter.

        Yields one book_detail request per product, requests the AJAX
        second-half of the page (paging=1..3), and follows pagination up
        to page 5.
        """
        item = response.meta['item']
        book_lists = response.xpath('//*[@id="product-list"]/ul/li')
        if not book_lists:
            # Different page template: retry via the book_info parser.
            yield scrapy.Request(
                response.url,
                callback=self.book_info,
                meta={"item": deepcopy(item)},
                dont_filter=True,
            )
        if book_lists:
            for book_list in book_lists:
                item["book_url"] = "https:" + book_list.xpath('div/div/div[1]/div/a/@href').extract_first()
                item["book_name"] = book_list.xpath('div/div/div[2]/div[2]/a/text()').extract_first()
                item["shopname"] = book_list.xpath('div/div/div[2]/div[4]/a/text()').extract_first()
                item['book_img'] = "https:" + book_list.xpath('div/div/div[1]/div/a/img/@src').extract_first()
                yield scrapy.Request(
                    item["book_url"],
                    callback=self.book_detail,
                    meta={"item": deepcopy(item)},
                )
        # Decode once; the page embeds its paging state in inline JS.
        body = response.body.decode()
        current_page = int(re.findall(r'param.currentPage = "(.*?)";', body)[0])
        # AJAX endpoint serving the lazily-loaded second half of the list.
        book_listh = "https://search.suning.com/emall/searchV1Product.do?keyword=%E6%95%A3%E6%96%87%E9%9A%8F%E7%AC%94&ci=0&pg=01&cp={}&il=1&st=0&iy=0&isNoResult=0&n=1&sesab=ACAABAAB&id=IDENTIFYING&cc=701&paging={}&sub=0&jzq=33003"
        for i in range(1, 4):
            href = book_listh.format(current_page, i)
            yield scrapy.Request(
                href,
                callback=self.noType3_bookinfo2,
                meta={"item": deepcopy(item)},
            )
        page_count = int(re.findall(r'param.pageNumbers = "(.*?)";', body)[0])
        # The category id (ci) is the second-to-last path segment of the URL.
        ci = item["type2_bookurl"].split("/")[-2]
        next_u = "https://search.suning.com//{}/&iy=0&isNoResult=0&cp={}"
        # Follow pagination, capped at page 5 to bound the demo crawl.
        if current_page < page_count - 1 and current_page < 5:
            print(ci)
            current_page += 1
            next_url = next_u.format(ci, current_page)
            yield scrapy.Request(
                next_url,
                callback=self.noType3_book_info,
                meta={"item": deepcopy(item)},
            )

    def book_info(self, response):
        """Parse a listing page for a type3 sub-category (same duties as
        noType3_book_info but with the filter-results page layout)."""
        item = response.meta['item']
        print("book_info")
        book_lists = response.xpath('//*[@id="filter-results"]/ul/li')
        # AJAX endpoint serving the lazily-loaded second half of the list.
        temp_url = "https://list.suning.com/emall/showProductList.do?ci={}&pg=03&cp={}&il=0&iy=0&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAAB&id=IDENTIFYING&cc=701&paging=1&sub=0"
        for book_list in book_lists:
            item["book_url"] = "https:" + book_list.xpath('div/div/div/div[1]/div/a/@href').extract_first()
            item["book_name"] = book_list.xpath('div/div/div/div[2]/p[2]/a/text()').extract_first()
            item["shopname"] = book_list.xpath('div/div/div/div[2]/p[4]/a/text()').extract_first()
            # Lazy-loaded thumbnails keep the real URL in @src2.
            item['book_img'] = "https:" + book_list.xpath('div/div/div/div[1]/div/a/img/@src2').extract_first()
            yield scrapy.Request(
                item["book_url"],
                callback=self.book_detail,
                meta={"item": deepcopy(item)},
            )
        # Decode once; paging state lives in inline JS.
        body = response.body.decode()
        current_page = int(re.findall(r'param.currentPage = "(.*?)";', body)[0])
        # ci is the middle token of the "1-<ci>-<page>.html" URL pattern.
        ci = item["type3_bookurl"].split("-")[1]
        next_booklist = temp_url.format(ci, current_page)
        yield scrapy.Request(
            next_booklist,
            callback=self.book_info2,
            meta={"item": deepcopy(item)},
        )
        page_count = int(re.findall(r'param.pageNumbers = "(.*?)";', body)[0])
        next_u = "https://list.suning.com/1-{}-{}.html#search-path-box"
        # Follow pagination, capped at page 5 to bound the demo crawl.
        if current_page < page_count - 1 and current_page < 5:
            current_page += 1
            next_url = next_u.format(ci, current_page)
            print(next_url)
            yield scrapy.Request(
                next_url,
                callback=self.book_info,
                meta={"item": deepcopy(item)},
            )

    def noType3_bookinfo2(self, response):
        """Parse the AJAX second-half fragment (bare <li> list) for the
        no-type3 layout and request each book's detail page."""
        if response.xpath('/html/body/li'):
            item = response.meta['item']
            book_lists = response.xpath('/html/body/li')
            print("noType3_bookinfo2" + str(len(book_lists)))
            for book_list in book_lists:
                item["book_url"] = "https:" + book_list.xpath('div/div/div[1]/div/a/@href').extract_first()
                item["book_name"] = book_list.xpath('div/div/div[2]/div[2]/a/text()').extract_first()
                item["shopname"] = book_list.xpath('div/div/div[2]/div[4]/a/text()').extract_first()
                item['book_img'] = "https:" + book_list.xpath('div/div/div[1]/div/a/img/@src').extract_first()
                yield scrapy.Request(
                    item["book_url"],
                    callback=self.book_detail,
                    meta={"item": deepcopy(item)},
                )

    def book_info2(self, response):
        """Parse the AJAX second-half fragment for the type3 layout and
        request each book's detail page."""
        if response.xpath('/html/body/li'):
            item = response.meta['item']
            book_lists = response.xpath('/html/body/li')
            for book_list in book_lists:
                item["book_url"] = "https:" + book_list.xpath('div/div/div/div[1]/div/a/@href').extract_first()
                item["book_name"] = book_list.xpath('div/div/div/div[2]/p[2]/a/text()').extract_first()
                item["shopname"] = book_list.xpath('div/div/div/div[2]/p[4]/a/text()').extract_first()
                # Lazy-loaded thumbnails keep the real URL in @src2.
                item['book_img'] = "https:" + book_list.xpath('div/div/div/div[1]/div/a/img/@src2').extract_first()
                yield scrapy.Request(
                    item["book_url"],
                    callback=self.book_detail,
                    meta={"item": deepcopy(item)},
                )

    def book_detail(self, response):
        """Build the price-service URL from ids embedded in the product
        page (URL path segments + inline JS vars) and request it."""
        item = response.meta['item']
        price_temp_url = "https://pas.suning.com/nspcsale_0_0000000{}_000000000{}_{}_140_701_7010101_502282_1000186_9186_11475_Z001___{}_{}___.html?callback=pcData&_=1559546689850"
        p1 = response.url.split("/")[-1].split(".")[0]
        p3 = response.url.split("/")[-2]
        # Decode once; both ids sit in the same inline JS blob.
        body = response.body.decode()
        p4 = re.findall(r'"catenIds":"(.*?)"', body)[0]
        p5 = re.findall(r'"weight":"(.*?)"', body)[0]
        price_temp_url = price_temp_url.format(p1, p1, p3, p4, p5)
        yield scrapy.Request(
            price_temp_url,
            callback=self.book_price,
            meta={"item": deepcopy(item)},
        )

    def book_price(self, response):
        """Extract the price; on miss, retry the alternate URL scheme
        (shorter zero-padding) via book_price2."""
        item = response.meta['item']
        try:
            # IndexError when the payload lacks a netPrice field; anything
            # else (e.g. a decode failure) should surface, not be hidden.
            price = re.findall(r'"netPrice":"(.*?)"', response.body.decode())[0]
        except IndexError:
            price_temp_url2 = re.sub("000000000", "0000000", response.url)
            yield scrapy.Request(
                price_temp_url2,
                callback=self.book_price2,
                meta={"item": deepcopy(item)},
            )
        else:
            item["book_price"] = price
            yield item

    def book_price2(self, response):
        """Extract the price from the fallback endpoint and yield the
        completed item (raises IndexError if still absent)."""
        item = response.meta['item']
        item["book_price"] = re.findall(r'"netPrice":"(.*?)"', response.body.decode())[0]
        yield item
Git 地址:(链接待补充)
如需代码优化或有其他问题,可以联系本人邮箱:zh15270924273@163.com