方法参考 Scrapy 的异步请求机制。第一步:抓取搜索结果列表页,从中提取商品 id。
class YmatouSpider(scrapy.Spider):
    """Spider for ymatou.com: scrapes the search-result list page for
    product ids and schedules a detail-page request for each one.
    """

    name = 'ymatou'
    start_urls = [r'https://www.ymatou.com/products?k=奶粉']

    def parse(self, response):
        """Collect every ``sproductid`` on the search list page and yield
        one detail-page request per product, passing the id via cb_kwargs.
        """
        selector = Selector(text=response.text)
        product_nodes = selector.xpath(
            '//div[@class="search-list"]//li[@class="product-item "]')
        product_ids = [node.xpath('./@sproductid').get()
                       for node in product_nodes]
        self.logger.info(f'lst{product_ids}')
        for product_id in product_ids:
            detail_url = f"https://www.ymatou.hk/product/{product_id}.html"
            yield Request(url=detail_url,
                          callback=self.get_price_name,
                          cb_kwargs={'id': product_id})
直接在列表页上换页时出现问题;于是改用浏览器 Network 面板里发现的分页接口(JSON API)来翻页,逐页递增 pageIndex 请求。
import scrapy
from scrapy import Selector
from scrapy import Request
from yang.items import YangItem
import json
class YmatouSpider(scrapy.Spider):
    """Spider for ymatou.com that pages through the product-list JSON API.

    Flow: ``parse`` (paginated list API) -> ``get_price_name`` (detail
    HTML page) -> ``get_pic`` (description API) -> yields a ``YangItem``
    with id, name, price, picture list and feature string.
    """

    name = 'ymatou'
    # Next page index to request; bumped once per list-API response.
    page = 1
    start_urls = [r'https://www.ymatou.com/products/api/getProductListByCondition?keyword=奶粉&pageIndex=1']

    def parse(self, response):
        """Parse one page of the list API.

        Yields a detail-page request per product id, then a request for
        the next page. Pagination stops when the JSON no longer carries
        ``result.ProductIds`` (KeyError).
        """
        data = response.json()
        try:
            product_ids = data["result"]["ProductIds"]
        except KeyError:
            # No more products on this page: stop scheduling new pages.
            return
        self.logger.info(f'lst{product_ids}')
        for product_id in product_ids:
            url = f"https://www.ymatou.hk/product/{product_id}.html"
            yield Request(url=url, callback=self.get_price_name,
                          cb_kwargs={'id': product_id})
        self.page = self.page + 1
        self.logger.info(f"page{self.page}")
        url = 'https://www.ymatou.com/products/api/getProductListByCondition?keyword=奶粉&pageIndex={}'.format(self.page)
        yield Request(url=url, callback=self.parse)

    def get_price_name(self, response, **kwargs):
        """Extract product name and price from the detail page, then
        request the description API for pictures and specs.

        ``kwargs`` carries ``id`` from ``parse`` and is forwarded (with
        ``name`` and ``price`` added) to ``get_pic``.
        """
        product_id = kwargs['id']  # renamed: avoid shadowing builtin `id`
        sel = Selector(text=response.text)
        # Guard against a missing <h3>: .get() may return None, and the
        # original's unguarded .strip() would raise AttributeError.
        name = sel.xpath('//div[@class="pro-property"]//h3/text()').get()
        kwargs['name'] = name.strip() if name else ''
        self.logger.info(f"name{kwargs['name']}")
        # Promotional price takes precedence; fall back to current price.
        # BUG FIX: the original chained try/except could still raise an
        # unhandled AttributeError (or leave `price` unbound -> NameError)
        # when both selectors missed, and called the non-existent
        # logger.Error(). Explicit None checks avoid all three.
        price = sel.xpath(
            '//div[@class="pro-property"]//span[@class="promo-price"]/text()').get()
        if price is None:
            price = sel.xpath(
                '//div[@class="pro-property"]//span[@class="current-price"]/text()').get()
        kwargs['price'] = price.strip() if price else ''
        url = f"https://www.ymatou.hk/product/api/GetProductDescription?productid={product_id}"
        yield Request(url=url, callback=self.get_pic, cb_kwargs=kwargs)

    def get_pic(self, response, **kwargs):
        """Parse the description API JSON: collect the spec key/value
        pairs ("商品参数") and picture list ("商品介绍"), then yield the
        final item.
        """
        payload = response.json()
        sections = payload["Data"]["Description"]
        self.logger.info(sections)
        features = []
        # Count the two wanted sections so we can stop scanning early.
        # BUG FIX: the original reset the counter *inside* the loop, so
        # the early `break` could never trigger.
        found = 0
        for section in sections:
            if found == 2:
                break
            if section["Title"] == "商品参数":
                for kv in section["KeyValue"]:
                    features.append(kv["Key"] + ":" + kv["Value"])
                kwargs['fea'] = ";".join(features)
                found += 1
            if section["Title"] == "商品介绍":
                kwargs['pic'] = str(section["PicList"])
                found += 1
        # Ensure every item field exists even when a section was absent.
        # BUG FIX: the original only defaulted 'fea'; a missing 'pic'
        # raised KeyError below.
        kwargs.setdefault('fea', '')
        kwargs.setdefault('pic', '')
        self.logger.info(kwargs)
        item = YangItem()
        item['id'] = kwargs['id']
        item['pic'] = kwargs['pic']
        item['name'] = kwargs['name']
        item['intro'] = kwargs['fea']
        item['price'] = kwargs['price']
        yield item