电商跟卖,最重要的是了解哪些商品可以卖、哪些商品不能卖。为了更好地了解商品信息,我们会经常爬取商品类目的信息。
需求
亚马逊类目信息链接爬虫
- 打开亚马逊类目信息地址
https://www.amazon.com/gp/new-releases/automotive/ref=zg_bsnr_nav_automotive_0
一直递归下去,获取亚马逊最细分类的 top 链接
将最底层类目保存到 Excel
实现
import copy
import datetime
import json
import os
import time

import pandas as pd

from platforms.base_platform import ObjectPlatform
from platforms.amazon.amazon_anti_scrape import AmazonAntiScrape
from platforms.amazon.amazon_product_url import AmazonProductUrl
from restful.rest_request import HttpRequests
from util.pd_util import PandasUtil
from util.xpath_operation import SeleniumOperation
class AmazonProductPlatform(ObjectPlatform):
    """Recursively crawl Amazon category pages down to the leaf categories.

    For every configured start URL the platform walks the category tree,
    collects each leaf category's full path and URL, reports the leaves to
    a remote service over REST, and writes them to an Excel file.
    """

    name = "amazon_class_url"
    desc = "亚马逊商品类目url生成和获取,获取到叶子结点,最后保存到excel"
    config_file = "%s%s" % (name, "_config")
    setting_file = "%s%s" % (name, "_setting")

    def __init__(self, config_file_input=None, setting_file_input=None, log=None):
        """Load config/setting modules, prepare the output dir and helpers.

        :param config_file_input: optional dotted path of the config module;
            defaults to ``<base_config_package><name>_config``.
        :param setting_file_input: optional dotted path of the setting module;
            defaults to ``<base_setting_package><name>_setting``.
        :param log: project logger shared with all helper objects.
        """
        super(AmazonProductPlatform, self).__init__(load_extension=False, login=False)
        self.config_file = config_file_input if config_file_input else \
            "%s%s" % (self.base_config_package, self.config_file)
        self.setting_file = setting_file_input if setting_file_input else \
            "%s%s" % (self.base_setting_package, self.setting_file)
        # a truthy fromlist makes __import__ return the submodule itself
        # instead of the top-level package
        self.config_package = __import__(self.config_file, fromlist=True)
        self.setting_package = __import__(self.setting_file, fromlist=True)
        self.log = log
        self.restful = HttpRequests()
        if not os.path.exists(self.config_package.amazon_config["out_path"]):
            os.makedirs(self.config_package.amazon_config["out_path"])
        SeleniumOperation.log = log
        self.amazon_url_obj = AmazonProductUrl(self.driver, self.config_package, self.setting_package, log)
        self.amazon_anti = AmazonAntiScrape(self.driver, self.config_package, self.setting_package, log)

    def before_run(self):
        """Announce platform start (hook invoked before ``run``)."""
        print("启动%s平台" % AmazonProductPlatform.desc)

    def run(self):
        """Crawl every configured start URL and persist its leaf categories."""
        urls = self.config_package.amazon_config["urls"]
        for url in urls:
            # open the Amazon category page and clear any anti-scrape blocks
            print("打开页面:", url)
            self.get_url_ignore_exception(url)
            self.amazon_anti.detect_all_solution(url)
            # NOTE(review): get_class_name() may return None/empty here,
            # which would break the file-name concatenation below — confirm
            class_name = self.amazon_url_obj.get_class_name()
            parents = []
            leaf_datas = []
            self.recursion_get_urls(parents, leaf_datas, url)
            # upload the leaves to the server (restful hides the
            # infrastructure), then persist them locally as an Excel report
            if leaf_datas:
                out_file_name = os.path.join(
                    self.config_package.amazon_config["out_path"],
                    str(datetime.date.today()) + "_" + class_name + ".xlsx")
                leaf_datas = pd.DataFrame.from_records(leaf_datas)
                self.report_class(leaf_datas["class_arrays"].to_numpy())
                leaf_datas = PandasUtil.fill_pandas_col(leaf_datas, "class_arrays", "class")
                PandasUtil.write_excel(leaf_datas, out_file_name)
                print("保存文件{}成功".format(out_file_name))

    def report_class(self, leaf_datas):
        """Upload category paths to the server in fixed-size slices.

        A failed slice is retried indefinitely (with a 10s pause) until the
        server acknowledges it with response code "0".

        :param leaf_datas: numpy array of per-leaf category path lists.
        """
        count = self.config_package.amazon_config["report_count"]
        begin = 0
        end = count
        # strict "<" so an exactly-divisible length does not trigger one
        # final POST with an empty slice (original used "<=")
        while begin < len(leaf_datas):
            print("正在上报类别信息切片:%s-%s,总共%s条到服务器" % (begin, end, len(leaf_datas)))
            code, http_result = self.restful.rest_post_method(
                self.config_package.amazon_config["data_save_url"],
                leaf_datas[begin:end].tolist())
            if code == 0:
                response = json.loads(http_result.read().decode(encoding='utf-8'))
                if response["code"] == "0":
                    # slice acknowledged — advance the window
                    begin = end
                    end = end + count
                    continue
            self.log.error("restful请求失败,code:{},reason:{}".format(code, http_result))
            print(
                "上报数据失败code:%s,原因:%s,继续重试,如果重试太久,请联系开发者看一下......" % (code, http_result))
            time.sleep(10)

    def recursion_get_urls(self, parents, leaf_datas, url):
        """Depth-first walk of the category tree starting at ``url``.

        ``parents`` holds the category names on the current path (mutated in
        place and restored before returning); every leaf found is appended to
        ``leaf_datas`` as a dict with the joined path, the path segments and
        the leaf URL.
        """
        leaf_class_name = SeleniumOperation.get_text(self.driver, self.setting_package.group_class_name)
        if leaf_class_name:
            # current page is a leaf: record the full path and its URL
            leaf_class_name = leaf_class_name.strip()
            leaf_class_names = copy.deepcopy(parents)
            leaf_class_names.append(leaf_class_name)
            leaf_url = self.driver.current_url
            leaf_datas.append({
                "class_name": "/".join(leaf_class_names),
                "class_arrays": leaf_class_names,
                "leaf_url": leaf_url
            })
            print("判断url:{}是叶子结点,路径是:{}".format(leaf_url, "/".join(leaf_class_names)))
            return
        maybe_parent_class_name = self.amazon_url_obj.get_class_name()
        while not maybe_parent_class_name:
            # page failed to render the class name: reload and retry.
            # NOTE(review): this loop is unbounded — it spins forever if the
            # page never recovers; consider adding a retry limit.
            info = "url:{},找不到父类名称.......,刷新页面...".format(url)
            print(info)
            self.log.warn(info)
            self.get_url_ignore_exception(url)
            self.amazon_anti.detect_all_solution(url)
            maybe_parent_class_name = self.amazon_url_obj.get_class_name()
        maybe_parent_class_name = maybe_parent_class_name.strip()
        urls = self.amazon_url_obj.get_class_urls()
        parents.append(maybe_parent_class_name)
        try:
            for temp_url in urls:
                # open each child category page and recurse into it
                self.get_url_ignore_exception(temp_url)
                self.amazon_anti.detect_all_solution(temp_url)
                self.recursion_get_urls(parents, leaf_datas, temp_url)
        except Exception as e:
            self.log.exception(e)
        finally:
            # always restore the path so sibling branches see the right prefix
            print("删除前的parents", "-".join(parents))
            parents.pop()
            print("删除后的parents", "-".join(parents))

    def after_run(self):
        """Announce completion and delegate cleanup to the base platform."""
        print("%s 平台已经运行完成,请根据log目录查看运行日志\n" % AmazonProductPlatform.name)
        super(AmazonProductPlatform, self).after_run()