Scraping all of Suning's books with scrapy
Uses MongoDB to store the scraped data, crawling book information across Suning's entire book site (book name, the book's detailed category and store, and price). A minimal MongoDB pipeline sketch is included after the spider code.
Notes
- The category structure must mirror the classification on Suning's book pages (big / middle / small categories).
- Each list page shows 60 books, but response.body only contains 30 of them; the other 30 are loaded separately, so you must construct that URL yourself and request it.
- In practice, the url for 雅思IELTS differs from the other list pages and needs its own constructed URL (a search-style URL built from the category name).
- Set the robots setting to False in settings (ROBOTSTXT_OBEY = False); see the settings.py sketch after this list.
- Prices are not on the list page; they are fetched by constructing a price URL from the book detail page.
- When yielding scrapy.Request inside a for loop, pass the item through meta with deepcopy.
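As a quick reference for the robots and MongoDB points above, here is a minimal settings.py sketch. Apart from ROBOTSTXT_OBEY, everything in it (the module path, pipeline class, Mongo URI and database name) is an assumption, not a value from the original post.

settings.py
ROBOTSTXT_OBEY = False  # Suning's robots.txt would otherwise block these requests

# Everything below is assumed, not from the original post:
ITEM_PIPELINES = {
    "book.pipelines.MongoPipeline": 300,  # hypothetical pipeline, sketched after the spider
}
MONGO_URI = "mongodb://localhost:27017"  # assumed local MongoDB
MONGO_DB = "suning"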
suning.py
import scrapy
import re
from copy import deepcopy
class SuningSpider(scrapy.Spider):
    name = 'suning'
    allowed_domains = ['suning.com']
    start_urls = ['http://book.suning.com/']

    def parse(self, response):
        # Grouped big-category menus and their matching sub-menus
        div_list = response.xpath("//div[@class='menu-list']/div[@class='menu-item']")
        div_sub_list = response.xpath("//div[@class='menu-list']/div[@class='menu-sub']")
        for div in div_list:
            item = {}
            # Name of the big category
            item['b_cate'] = div.xpath(".//h3/a/text()").get()
            # The sub-menu at the same index holds this category's children
            current_sub_div = div_sub_list[div_list.index(div)]
            # Groups of middle categories
            p_list = current_sub_div.xpath(".//div[@class='submenu-left']/p[@class='submenu-item']")
            for p in p_list:
                # Name of the middle category
                item["m_cate"] = p.xpath("./a/text()").get()
                # Groups of small categories
                li_list = p.xpath("./following-sibling::ul[1]/li")
                for li in li_list:
                    # Name and url of the small category
                    item['s_cate'] = li.xpath("./a/text()").get()
                    item['s_href'] = li.xpath("./a/@href").get()
                    # Request the book list page (first 30 books)
                    yield scrapy.Request(
                        item['s_href'],
                        callback=self.parse_book_list,
                        meta={"item": deepcopy(item)}
                    )
                    # The remaining 30 books are loaded separately; build that url and request it
                    next_part_url_temp = "https://list.suning.com/emall/showProductList.do?ci={}&pg=03&cp=0&il=0&iy=0&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAABC&id=IDENTIFYING&cc=394&paging=1&sub=0"
                    ci = item['s_href'].split('-')
                    if len(ci) > 1:
                        ci = item['s_href'].split('-')[1]
                        next_part_url = next_part_url_temp.format(ci)
                    else:
                        # 雅思IELTS has a search-style url, built from the category name instead of a ci id
                        name = item['s_cate']
                        next_part_url_temp_d = "https://search.suning.com/emall/searchProductList.do?keyword={}&ci=0&pg=01&cp=3&il=0&st=0&iy=0&adNumber=0&n=1&ch=4&sesab=ACAABAABCCAA&id=IDENTIFYING&cc=394&paging=1&sub=0"
                        next_part_url = next_part_url_temp_d.format(name)
                    yield scrapy.Request(
                        next_part_url,
                        callback=self.parse_book_list,
                        meta={"item": deepcopy(item)}
                    )

    def parse_book_list(self, response):
        item = response.meta["item"]
        li_list = response.xpath("//li[contains(@class,'product')]")
        for li in li_list:
            # Book title
            item['book_name'] = li.xpath(".//p[@class='sell-point']/a/text()").get(default='').strip()
            # Link to the book's detail page
            item['book_href'] = li.xpath(".//p[@class='sell-point']/a/@href").get()
            # Store selling the book
            item['book_store_name'] = li.xpath(".//p[contains(@class,'seller oh no-more')]/a/text()").get()
            yield response.follow(
                item['book_href'],
                callback=self.parse_book_detail,
                meta={'item': deepcopy(item)}
            )
        # Pagination: request both halves of the next page
        next_url_1 = "https://list.suning.com/emall/showProductList.do?ci={}&pg=03&cp={}&il=0&iy=0&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAABC&id=IDENTIFYING&cc=394"
        next_url_2 = "https://list.suning.com/emall/showProductList.do?ci={}&pg=03&cp={}&il=0&iy=0&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAABC&id=IDENTIFYING&cc=394&paging=1&sub=0"
        next_url_1_d = "https://search.suning.com/emall/searchProductList.do?keyword={}&ci={}&pg=01&cp=1&il=0&st=0&iy=0&adNumber=0&n=1&ch=4&sesab=ACAABAABCCAA&id=IDENTIFYING&cc=394"
        next_url_2_d = "https://search.suning.com/emall/searchProductList.do?keyword={}&ci={}&pg=01&cp=1&il=0&st=0&iy=0&adNumber=0&n=1&ch=4&sesab=ACAABAABCCAA&id=IDENTIFYING&cc=394"
        current_page = re.findall('param.currentPage = "(.*?)";', response.body.decode())[0]
        total_page = re.findall('param.pageNumbers = "(.*?)";', response.body.decode())[0]
        if int(current_page) < int(total_page):
            next_page_num = int(current_page) + 1
            ci = item['s_href'].split('-')
            if len(ci) > 1:
                ci = item['s_href'].split('-')[1]
                next_url_1 = next_url_1.format(ci, next_page_num)
                next_url_2 = next_url_2.format(ci, next_page_num)
            else:
                # Search-style pages (e.g. 雅思IELTS) paginate by keyword instead of ci id
                name = item['s_cate']
                next_url_1 = next_url_1_d.format(name, next_page_num)
                next_url_2 = next_url_2_d.format(name, next_page_num)
            # deepcopy so the two parallel requests do not share one mutable item
            yield scrapy.Request(
                next_url_1,
                callback=self.parse_book_list,
                meta={"item": deepcopy(item)}
            )
            yield scrapy.Request(
                next_url_2,
                callback=self.parse_book_list,
                meta={"item": deepcopy(item)}
            )

    def parse_book_detail(self, response):
        item = response.meta['item']
        # The price lives behind a separate interface; its url is assembled from
        # ids taken from the detail page's url and source
        price_temp_url = "https://pas.suning.com/nspcsale_0_0000000{}_0000000{}_{}_180_394_3940199_502282_1000118_9118_10920_Z001___{}_{}________0___0.0_2__502320_502687_.html"
        p1 = response.url.split('/')[-1].split('.')[0]
        p3 = response.url.split('/')[-2]
        p4 = re.findall('"catenIds":"(.*?)"', response.body.decode())
        p4 = p4[0] if p4 else ''  # fall back to an empty id rather than a list repr
        p5 = re.findall('"weight":"(.*?)"', response.body.decode())
        p5 = p5[0] if p5 else ''
        price_url = price_temp_url.format(p1, p1, p3, p4, p5)
        yield scrapy.Request(
            price_url,
            callback=self.parse_book_price,
            meta={'item': item}
        )

    def parse_book_price(self, response):
        item = response.meta['item']
        # "netPrice" is the selling price in the interface's response
        prices = re.findall('"netPrice":"(.*?)"', response.body.decode())
        item['book_price'] = prices[0] if prices else None
        yield item
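The introduction mentions MongoDB storage, but the post stops at the spider. Below is a minimal pipelines.py sketch using pymongo, assuming the MONGO_URI / MONGO_DB settings from the settings.py sketch above; the "books" collection name is also an assumption, not from the original code.

pipelines.py
import pymongo


class MongoPipeline:
    # Minimal sketch: writes each scraped book dict into MongoDB (assumed setup)

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # MONGO_URI / MONGO_DB are assumed settings, see the settings.py sketch above
        return cls(
            mongo_uri=crawler.settings.get("MONGO_URI", "mongodb://localhost:27017"),
            mongo_db=crawler.settings.get("MONGO_DB", "suning"),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db["books"].insert_one(dict(item))  # "books" collection name is an assumption
        return item

Enable it through ITEM_PIPELINES (as in the settings sketch) and run the spider with scrapy crawl suning.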