别说了,GKD,滴滴学生卡
不上注释了哦,如果哪里我写的不明白就留言哈
import scrapy,re
from copy import deepcopy
class SuningSpider(scrapy.Spider):
    """Crawl book.suning.com: category menu -> listing pages -> book detail pages.

    Each yielded item is a plain dict with the keys ``menu_list``,
    ``small_list``, ``small_href``, ``book_name``, ``book_href``,
    ``book_price``, ``book_author`` and ``book_public``.  The partially
    filled dict is handed from callback to callback through
    ``Request.meta["item"]``.
    """

    name = 'suning'
    allowed_domains = ['suning.com']
    start_urls = ['https://book.suning.com']

    # Values embedded in inline scripts of the listing / detail pages.
    # Compiled once; raw strings with the literal dots escaped.
    _CURRENT_PAGE_RE = re.compile(r'param\.currentPage = "(\d+)"')
    _PAGE_COUNT_RE = re.compile(r'param\.pageNumbers = "(\d+)"')
    _CATEGORY_ID_RE = re.compile(r'"categoryId": "(.*?)"')
    _ITEM_PRICE_RE = re.compile(r'"itemPrice":"(.*?)"')

    def parse(self, response):
        """Walk the category menu and request every sub-category listing.

        Yields one ``scrapy.Request`` per ``<dd>/<a>`` link found under each
        ``<dl>`` of the ``menu-list`` block, carrying a fresh item dict with
        the top-level and sub-category names.
        """
        for dl in response.xpath('//div[@class="menu-list"]//dl'):
            menu_name = dl.xpath('./dt/h3/a/text()').extract_first()
            # Iterate the links of THIS <dl>; iterating the outer list again
            # would pair every menu with unrelated sub-categories and only
            # ever look at the first link of each group.
            for link in dl.xpath('./dd/a'):
                href = link.xpath('./@href').extract_first()
                if not href:
                    # A link without a target cannot be followed.
                    continue
                item = {
                    "menu_list": menu_name,
                    "small_list": link.xpath('./text()').extract_first(),
                    # urljoin resolves relative / protocol-relative hrefs.
                    "small_href": response.urljoin(href),
                }
                yield scrapy.Request(
                    item["small_href"],
                    callback=self.parse_smallhref,
                    meta={"item": item},
                )

    def parse_smallhref(self, response):
        """Parse one listing page: request each book, then the next page.

        Reads the category-level item from ``response.meta["item"]`` and
        yields a detail-page request per book plus, when more pages remain,
        a request for the following listing page.
        """
        category_item = response.meta["item"]

        # NOTE(review): assumes the books are <li> children of the <ul>
        # inside #filter-results -- confirm against the live markup.
        for li in response.xpath('//div[@id="filter-results"]/ul/li'):
            anchor = li.xpath('.//div[@class="res-info"]/p[2]/a')
            href = anchor.xpath('./@href').extract_first()
            if not href:
                # Entry without a product link (e.g. a placeholder): skip
                # instead of failing on string concatenation with None.
                continue
            # Work on a private copy so concurrent requests never share state.
            item = deepcopy(category_item)
            item["book_name"] = anchor.xpath('./text()').extract_first()
            item["book_href"] = response.urljoin(href)
            yield scrapy.Request(
                item["book_href"],
                callback=self.parse_detail,
                meta={"item": item},
            )

        current = self._CURRENT_PAGE_RE.search(response.text)
        total = self._PAGE_COUNT_RE.search(response.text)
        category = self._CATEGORY_ID_RE.search(response.text)
        if not (current and total and category):
            # Paging variables are missing from the page; stop paginating
            # rather than raising and losing the whole callback.
            self.logger.warning("No paging info found on %s", response.url)
            return

        current_page = int(current.group(1))
        if current_page < int(total.group(1)):
            next_url = 'https://list.suning.com/1-{}-{}.html'.format(
                category.group(1), current_page + 1
            )
            yield scrapy.Request(
                next_url,
                callback=self.parse_smallhref,
                # Pass the category-level item only, without fields of the
                # last book seen on this page.
                meta={"item": deepcopy(category_item)},
            )

    def parse_detail(self, response):
        """Fill in price, author and publisher from a book detail page.

        Yields the completed item dict.  ``book_price`` is ``None`` when the
        ``itemPrice`` value is not present in the page source.
        """
        item = response.meta["item"]
        price = self._ITEM_PRICE_RE.search(response.text)
        item["book_price"] = price.group(1) if price else None
        item["book_author"] = response.xpath(
            '//ul[@class="bookcon-param clearfix"]/li[1]/span/text()'
        ).extract_first()
        item["book_public"] = response.xpath(
            '//ul[@class="bookcon-param clearfix"]/li[2]/text()'
        ).extract_first()
        yield item