洋码头商品数据获取

方法参考 Scrapy 的异步抓取方式。

获取列表页面

class YmatouSpider(scrapy.Spider):
    """Spider for ymatou.com product search listings (keyword: 奶粉)."""
    name = 'ymatou'
    start_urls = [r'https://www.ymatou.com/products?k=奶粉']

    def parse(self, response):
        """Collect product ids from the search-result page and schedule
        one detail-page request per product."""
        selector = Selector(text=response.text)
        # Each matching <li> carries its product id in the sproductid attribute.
        lst = [
            node.xpath('./@sproductid').get()
            for node in selector.xpath(
                '//div[@class="search-list"]//li[@class="product-item "]')
        ]
        self.logger.info(f'lst{lst}')
        for product_id in lst:
            detail_url = f"https://www.ymatou.hk/product/{product_id}.html"
            yield Request(url=detail_url,
                          callback=self.get_price_name,
                          cb_kwargs={'id': product_id})

换页时出现问题(此处原有截图,未能保留)。
解决办法:改用浏览器开发者工具 Network 面板中出现的接口来实现翻页。

import scrapy
from scrapy import Selector
from scrapy import Request
from yang.items import YangItem
import json
class YmatouSpider(scrapy.Spider):
    """Crawl ymatou product search results for the keyword 奶粉.

    Flow:
      1. parse          -- page through the search API, collecting product ids
      2. get_price_name -- scrape name/price from each product detail page
      3. get_pic        -- fetch the description API, then yield a YangItem
    """
    name = 'ymatou'
    page = 1  # current API page index; incremented after each parsed page
    start_urls = [r'https://www.ymatou.com/products/api/getProductListByCondition?keyword=奶粉&pageIndex=1']

    def parse(self, response):
        """Parse one page of the search API and schedule the next page.

        Paging stops when the response no longer carries product ids
        (the key is missing on the last page).
        """
        res = response.json()
        try:
            lst = res["result"]["ProductIds"]
        except KeyError:  # no more results: stop scheduling pages
            return
        self.logger.info(f'lst{lst}')
        for k in lst:
            url = f"https://www.ymatou.hk/product/{k}.html"
            yield Request(url=url, callback=self.get_price_name,
                          cb_kwargs={'id': k})

        self.page = self.page + 1
        self.logger.info(f"page{self.page}")
        url = 'https://www.ymatou.com/products/api/getProductListByCondition?keyword=奶粉&pageIndex={}'.format(self.page)
        yield Request(url=url, callback=self.parse)

    def get_price_name(self, response, **kwargs):
        """Extract name and price from a product detail page, then request
        the description API.

        kwargs accumulates the item fields ('id' on entry; 'name' and
        'price' are added here and forwarded via cb_kwargs).
        """
        id = kwargs['id']
        res = Selector(text=response.text)

        # `or ''` guards a missing node so .strip() cannot raise on None.
        kwargs['name'] = (res.xpath('//div[@class="pro-property"]//h3/text()').get() or '').strip()
        self.logger.info(f"name{kwargs['name']}")
        # Prefer the promotional price; fall back to the regular price.
        # BUG FIX: the original broad `except Exception` branch called the
        # nonexistent logger.Error and left `price` unbound (NameError);
        # price now always has a value.
        try:
            price = res.xpath('//div[@class="pro-property"]//span[@class="promo-price"]/text()').get().strip()
        except AttributeError:
            price = (res.xpath('//div[@class="pro-property"]//span[@class="current-price"]/text()').get() or '').strip()
        kwargs['price'] = price
        url = f"https://www.ymatou.hk/product/api/GetProductDescription?productid={id}"
        yield Request(url=url, callback=self.get_pic, cb_kwargs=kwargs)

    def get_pic(self, response, **kwargs):
        """Read the description API payload and yield the finished item.

        Only two sections are of interest: 商品参数 (specs, joined into
        the 'fea' field) and 商品介绍 (introduction, whose picture list
        becomes 'pic').
        """
        j = response.json()
        lst = j["Data"]["Description"]
        self.logger.info(lst)
        fea = []
        # BUG FIX: count was reset to 0 inside the loop, so the early
        # break below could never trigger; it is now hoisted out.
        count = 0  # how many of the two wanted sections have been found
        for item in lst:
            if count == 2:  # both sections collected: skip the rest
                break
            if item["Title"] == "商品参数":
                for kv in item["KeyValue"]:
                    fea.append(kv["Key"] + ":" + kv["Value"])
                kwargs['fea'] = ";".join(fea)
                count = count + 1
            if item["Title"] == "商品介绍":
                kwargs['pic'] = str(item["PicList"])
                count = count + 1

        # Default missing sections to empty strings so item assembly
        # cannot raise KeyError ('pic' previously had no default).
        kwargs.setdefault('fea', '')
        kwargs.setdefault('pic', '')
        self.logger.info(kwargs)
        item = YangItem()
        item['id'] = kwargs['id']
        item['pic'] = kwargs['pic']
        item['name'] = kwargs['name']
        item['intro'] = kwargs['fea']
        item['price'] = kwargs['price']
        yield item

(此处原有截图,未能保留。)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值