【python爬虫】爬取唯品会商品信息

唯品会商品信息采集步骤:

  1. 获取品牌ID和品牌名称;
  2. 获取当前品牌商品列表的总页数;
  3. 获取每页商品列表中商品的信息。

一、获取品牌ID和品牌名称

    def get_task(self, task_list=None):
        '''
        Query the Vipshop brand-search API for every goods-type keyword and
        store one task row per returned brand into ``self.task_tbl``.

        :param task_list: iterable of goods-type keyword strings; ``None`` or
                          empty means there is nothing to do.
        :return: None -- rows are persisted via ``self.sc.store_data``.
        '''
        # Guard: the original iterated the ``None`` default and raised
        # TypeError before ever reaching its except clause.
        if not task_list:
            return
        try:
            print("商品类型列表:", task_list)
            for task_ in task_list:
                # url-encode the keyword so CJK characters survive in the URL
                url_str = urlencode({"keyword": task_})
                start_api = f'''https://mapi-rp.vip.com/vips-mobile/rest/shop/search/brand_store/get/v3?app_name=shop_wap&app_version=4.0&api_key=8cec5243ade04ed3a02c5972bcda0d3f&mobile_platform=2&source_app=yd_wap&warehouse=VIP_NH&fdc_area_id=104104101&province_id=104104&mars_cid=1584322664117_812f182347fe5848add8d04b91257af6&mobile_channel=mobiles-adp%3Ag1o71nr0%3A%3A%3A%3A%7C%7C&standby_id=nature&channel_id=1&isAZSort=1&gPlatform=WAP&mvip=true&_=1599117093&{url_str}'''
                print(task_, " ", start_api)
                task_resp = self.sc.get_html(start_api)
                if not task_resp:
                    # get_html returns a falsy value on request failure
                    # (see get_totalpage); skip instead of crashing on .json()
                    continue
                task_json = task_resp.json()
                if task_json['code'] == 1:
                    # NOTE: the original rebound the ``task_list`` parameter
                    # here, shadowing the sequence being iterated; use a
                    # distinct name for the rows to insert.
                    insert_rows = []
                    for brand_dict in task_json['data']['list']:
                        brand_id = int(brand_dict['id'])    # brand ID
                        brand_name = brand_dict['name']     # brand name
                        add_time = datetime.datetime.now()  # insert timestamp
                        insert_rows.append((task_, brand_id, brand_name, add_time, 1))
                    if insert_rows:
                        # Parameterized insert: values are bound by the DB
                        # driver, only the table name is interpolated.
                        sql = f'''insert into {self.task_tbl}(goods_type,brand_id,brand_name,add_time,is_state)
                        values(%s,%s,%s,%s,%s)'''
                        print("当前任务:", insert_rows)
                        self.sc.store_data(sql, data_list=insert_rows)
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # still propagate; project-level error collector records the rest.
            self.sc.collect_error()

二、获取当前品牌商品列表的总页数

    def get_totalpage(self, id, brand_id, goods_type):
        '''
        Fetch the total product count and API page size for one brand, and
        record ``total_num`` on the task row in ``self.task_tbl``.

        :param id: primary key of the task row (int).
        :param brand_id: Vipshop brand-store id interpolated into the API URL.
        :param goods_type: goods-type keyword used as the search term.
        :return: ``(total_num, page_offset)``. On any failure ``(0, 1)`` is
                 returned so callers can always unpack two values -- the
                 original returned a bare ``0`` (request failure) or ``None``
                 (exception), which crashed the caller's tuple unpacking.
        '''
        try:
            url_str = urlencode({"keyword": goods_type})
            total_api = f'''https://mapi-rp.vip.com/vips-mobile/rest/shopping/search/product/rank?app_name=shop_wap&app_version=4.0&api_key=8cec5243ade04ed3a02c5972bcda0d3f&mobile_platform=2&source_app=yd_wap&warehouse=VIP_NH&fdc_area_id=104104101&province_id=104104&mars_cid=1584322664117_812f182347fe5848add8d04b91257af6&mobile_channel=mobiles-adp%3Ag1o71nr0%3A%3A%3A%3A%7C%7C&standby_id=nature&{url_str}&brandStoreSns={brand_id}&sort=0&pageOffset=0&channelId=1&wapConsumer=A1&gPlatform=WAP&functions=bsBrands%2CfavNumLabel%2CtotalLabel&mvip=true&_=1599122080'''
            print("任务ID:", id, "商品类型:", goods_type, "品牌ID:", brand_id, "获取总页数:")
            print(total_api)
            html = self.sc.get_html(total_api)
            if not html:
                # Request failed: return a safe, unpackable sentinel.
                return (0, 1)
            resp = html.json()
            total_num = int(resp['data']['total'])
            # Clamp to >= 1: a zero batchSize would make the caller's
            # range(0, total, page_offset) raise ValueError.
            page_offset = max(int(resp['data']['batchSize']), 1)
            # id/total_num are ints so this interpolation cannot inject, but a
            # parameterized update would be preferable for consistency.
            up_sql = f'''update {self.task_tbl} set total_num={total_num} where id={id}'''
            print("更新总商品:", up_sql)
            self.sc.store_data(up_sql)
            return (total_num, page_offset)
        except Exception:
            self.sc.collect_error()
            # Keep the contract: always a 2-tuple (0 total => crawl nothing).
            return (0, 1)

三、获取每页商品列表中商品的信息

    def get_products(self, data_tuple):
        '''
        Crawl every product-list page of one brand task, store the product IDs
        into ``self.data_tbl``, then mark the task done (``is_state=0``).

        :param data_tuple: task row shaped as
            ``(id, goods_type, brand_id, brand_name, total_num, add_time, is_state)``;
            the last three fields are ignored here (``total_num`` is
            re-fetched, the rest are unused).
        :return: None
        '''
        try:
            task_id, goods_type, brand_id, brand_name, _, _, _ = data_tuple
            # get_totalpage historically returned 0/None on failure; guard the
            # unpack so a failed page-count request cannot raise TypeError.
            page_info = self.get_totalpage(task_id, brand_id, goods_type)
            if not isinstance(page_info, tuple):
                return
            total_num, page_offset = page_info
            if total_num <= 0 or page_offset <= 0:
                # Nothing to crawl (or the count request failed); leave the
                # task pending so it can be retried later.
                return
            time.sleep(3.2)
            # The keyword encoding is invariant across pages: hoist it out of
            # the loop (the original rebuilt it every iteration).
            url_str = urlencode({"keyword": goods_type})
            for page_start in range(0, total_num, page_offset):
                brand_api = f'''https://mapi-rp.vip.com/vips-mobile/rest/shopping/search/product/rank?app_name=shop_wap&app_version=4.0&api_key=8cec5243ade04ed3a02c5972bcda0d3f&mobile_platform=2&source_app=yd_wap&warehouse=VIP_NH&fdc_area_id=104104101&province_id=104104&mars_cid=1584322664117_812f182347fe5848add8d04b91257af6&mobile_channel=mobiles-adp%3Ag1o71nr0%3A%3A%3A%3A%7C%7C&standby_id=nature&{url_str}&brandStoreSns={brand_id}&sort=0&pageOffset={page_start}&channelId=1&wapConsumer=A1&gPlatform=WAP&functions=bsBrands%2CfavNumLabel%2CtotalLabel&mvip=true&_=1599122080'''
                print(f"获取{page_start}商品", brand_api)
                html = self.sc.get_html(brand_api)
                if not html:
                    print("获取商品ID页面失败")
                    continue
                products_list = html.json()['data']['products']
                if products_list:
                    # One row per product: (type, brand, product id, now, 1).
                    insert_list = [
                        (goods_type, brand_id, brand_name,
                         int(products_dict['pid']), datetime.datetime.now(), 1)
                        for products_dict in products_list
                    ]
                    # "insert ignore" deduplicates re-crawled product IDs.
                    insert_sql = f'''insert ignore into {self.data_tbl}(goods_type,brand_id,brand_name,
                    goods_id,add_time,is_state) values (%s,%s,%s,%s,%s,%s)'''
                    print("数据插入", "*" * 50)
                    self.sc.store_data(insert_sql, data_list=insert_list)
                    # Randomized delay to throttle requests politely.
                    time.sleep(random.uniform(1.7, 4.2))

            # All pages fetched: flag the task as finished.
            up_sql = f'''update {self.task_tbl} set is_state=0 where id={task_id}'''
            print(f"{brand_id}——{brand_name}爬完商品ID", up_sql)
            self.sc.store_data(up_sql)
        except Exception:
            # Narrowed from a bare except; project error collector logs it.
            self.sc.collect_error()

以上就是我的分享,如果有什么不足之处请指出,多交流,谢谢!

想获取更多数据或定制爬虫的请私信我。

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

秋无之地

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值