电商跟卖,最重要的是了解哪些商品可以卖、哪些商品不能卖。为了更好地了解商品信息,我们会经常爬取商品类目的信息。
需求
亚马逊类目信息链接爬虫
- 打开亚马逊类目信息地址
https://www.amazon.com/gp/new-releases/automotive/ref=zg_bsnr_nav_automotive_0
一直递归下去,获取亚马逊最细分类的 top 链接
将最底层类目保存到 Excel
实现
import copy
import datetime
import json
import os
import time

import pandas as pd

from platforms.base_platform import ObjectPlatform
from platforms.amazon.amazon_anti_scrape import AmazonAntiScrape
from platforms.amazon.amazon_product_url import AmazonProductUrl
from restful.rest_request import HttpRequests
from util.pd_util import PandasUtil
from util.xpath_operation import SeleniumOperation
class AmazonProductPlatform(ObjectPlatform):
    """Recursively crawl Amazon category pages down to the leaf categories.

    For every configured start URL the platform walks the category tree,
    collects each leaf category's full path and URL, reports the leaves to
    a remote service over REST, and writes them to an Excel file.
    """

    name = "amazon_class_url"
    desc = "亚马逊商品类目url生成和获取,获取到叶子结点,最后保存到excel"
    config_file = "%s%s" % (name, "_config")
    setting_file = "%s%s" % (name, "_setting")

    def __init__(self, config_file_input=None, setting_file_input=None, log=None):
        """Load config/setting modules, prepare the output dir and helpers.

        :param config_file_input: optional dotted path of the config module;
            defaults to ``<base_config_package><name>_config``.
        :param setting_file_input: optional dotted path of the setting module;
            defaults to ``<base_setting_package><name>_setting``.
        :param log: project logger shared with all helper objects.
        """
        super(AmazonProductPlatform, self).__init__(load_extension=False, login=False)
        self.config_file = config_file_input if config_file_input else \
            "%s%s" % (self.base_config_package, self.config_file)
        self.setting_file = setting_file_input if setting_file_input else \
            "%s%s" % (self.base_setting_package, self.setting_file)
        # a truthy fromlist makes __import__ return the submodule itself
        # instead of the top-level package
        self.config_package = __import__(self.config_file, fromlist=True)
        self.setting_package = __import__(self.setting_file, fromlist=True)
        self.log = log
        self.restful = HttpRequests()
        if not os.path.exists(self.config_package.amazon_config["out_path"]):
            os.makedirs(self.config_package.amazon_config["out_path"])
        SeleniumOperation.log = log
        self.amazon_url_obj = AmazonProductUrl(self.driver, self.config_package, self.setting_package, log)
        self.amazon_anti = AmazonAntiScrape(self.driver, self.config_package, self.setting_package, log)

    def before_run(self):
        """Announce platform start (hook invoked before ``run``)."""
        print("启动%s平台" % AmazonProductPlatform.desc)

    def run(self):
        """Crawl every configured start URL and persist its leaf categories."""
        urls = self.config_package.amazon_config["urls"]
        for url in urls:
            # open the Amazon category page and clear any anti-scrape blocks
            print("打开页面:", url)
            self.get_url_ignore_exception(url)
            self.amazon_anti.detect_all_solution(url)
            # NOTE(review): get_class_name() may return None/empty here,
            # which would break the file-name concatenation below — confirm
            class_name = self.amazon_url_obj.get_class_name()
            parents = []
            leaf_datas = []
            self.recursion_get_urls(parents, leaf_datas, url)
            # upload the leaves to the server (restful hides the
            # infrastructure), then persist them locally as an Excel report
            if leaf_datas:
                out_file_name = os.path.join(
                    self.config_package.amazon_config["out_path"],
                    str(datetime.date.today()) + "_" + class_name + ".xlsx")
                leaf_datas = pd.DataFrame.from_records(leaf_datas)
                self.report_class(leaf_datas["class_arrays"].to_numpy())
                leaf_datas = PandasUtil.fill_pandas_col(leaf_datas, "class_arrays", "class")
                PandasUtil.write_excel(leaf_datas, out_file_name)
                print("保存文件{}成功".format(out_file_name))

    def report_class(self, leaf_datas):
        """Upload category paths to the server in fixed-size slices.

        A failed slice is retried indefinitely (with a 10s pause) until the
        server acknowledges it with response code "0".

        :param leaf_datas: numpy array of per-leaf category path lists.
        """
        count = self.config_package.amazon_config["report_count"]
        begin = 0
        end = count
        # strict "<" so an exactly-divisible length does not trigger one
        # final POST with an empty slice (original used "<=")
        while begin < len(leaf_datas):
            print("正在上报类别信息切片:%s-%s,总共%s条到服务器" % (begin, end, len(leaf_datas)))
            code, http_result = self.restful.rest_post_method(
                self.config_package.amazon_config["data_save_url"],
                leaf_datas[begin:end].tolist())
            if code == 0:
                response = json.loads(http_result.read().decode(encoding='utf-8'))
                if response["code"] == "0":
                    # slice acknowledged — advance the window
                    begin = end
                    end = end + count
                    continue
            self.log.error("restful请求失败,code:{},reason:{}".format(code, http_result))
            print(
                "上报数据失败code:%s,原因:%s,继续重试,如果重试太久,请联系开发者看一下......" % (code, http_result))
            time.sleep(10)

    def recursion_get_urls(self, parents, leaf_datas, url):
        """Depth-first walk of the category tree starting at ``url``.

        ``parents`` holds the category names on the current path (mutated in
        place and restored before returning); every leaf found is appended to
        ``leaf_datas`` as a dict with the joined path, the path segments and
        the leaf URL.
        """
        leaf_class_name = SeleniumOperation.get_text(self.driver, self.setting_package.group_class_name)
        if leaf_class_name:
            # current page is a leaf: record the full path and its URL
            leaf_class_name = leaf_class_name.strip()
            leaf_class_names = copy.deepcopy(parents)
            leaf_class_names.append(leaf_class_name)
            leaf_url = self.driver.current_url
            leaf_datas.append({
                "class_name": "/".join(leaf_class_names),
                "class_arrays": leaf_class_names,
                "leaf_url": leaf_url
            })
            print("判断url:{}是叶子结点,路径是:{}".format(leaf_url, "/".join(leaf_class_names)))
            return
        maybe_parent_class_name = self.amazon_url_obj.get_class_name()
        while not maybe_parent_class_name:
            # page failed to render the class name: reload and retry.
            # NOTE(review): this loop is unbounded — it spins forever if the
            # page never recovers; consider adding a retry limit.
            info = "url:{},找不到父类名称.......,刷新页面...".format(url)
            print(info)
            self.log.warn(info)
            self.get_url_ignore_exception(url)
            self.amazon_anti.detect_all_solution(url)
            maybe_parent_class_name = self.amazon_url_obj.get_class_name()
        maybe_parent_class_name = maybe_parent_class_name.strip()
        urls = self.amazon_url_obj.get_class_urls()
        parents.append(maybe_parent_class_name)
        try:
            for temp_url in urls:
                # open each child category page and recurse into it
                self.get_url_ignore_exception(temp_url)
                self.amazon_anti.detect_all_solution(temp_url)
                self.recursion_get_urls(parents, leaf_datas, temp_url)
        except Exception as e:
            self.log.exception(e)
        finally:
            # always restore the path so sibling branches see the right prefix
            print("删除前的parents", "-".join(parents))
            parents.pop()
            print("删除后的parents", "-".join(parents))

    def after_run(self):
        """Announce completion and delegate cleanup to the base platform."""
        print("%s 平台已经运行完成,请根据log目录查看运行日志\n" % AmazonProductPlatform.name)
        super(AmazonProductPlatform, self).after_run()