2)速卖通商品信息采集(进阶)

最近好多小伙伴私信我要代码,因为是去年完成的,某通更改了部分结构,导致代码失效,最近花了一点时间,更新了代码!

回顾前文,已经说明某通的数据是js渲染过的,但是也存放在网页源码中,转换成了json格式,我们需要通过正则来提取。
两种思路:

1)selenium;对速度没要求,想方便一点,推荐,直接使用xpath提取数据,但是速度较慢,不适用于爬取量过大
2)requests;速度快,爬取量大,就是提取数据的时候比较麻烦,要分析结构
3)逆向就不推荐了,需要懂js,难度较高,有想法可以试着本地加载js渲染数据

  • 读取链接
def read_url():
    """Read crawl targets from url.txt (one URL per line).

    Returns a list of dicts, one per line, each carrying the total line
    count, the line's 0-based index, and the raw line text (trailing
    newline included — downstream code strips it itself).
    """
    with open('url.txt', 'r', encoding='UTF-8') as f:  # 读取url文本
        lines = f.readlines()

    total = len(lines)
    # enumerate instead of range(len(...)); keep the raw line unchanged so
    # callers that strip the trailing newline themselves keep working.
    return [{"total": total, "index": i, "value": line}
            for i, line in enumerate(lines)]
  • 请求网页(requests)
def get_html(addr_map):
    """Fetch one product page.

    addr_map: a dict from read_url() whose 'value' key holds the raw URL
    line.  Returns the decoded page HTML on success, or None on any
    failure so the caller can skip the entry.
    """
    headers = {
        'User-agent': UserAgent().random,  # random UA per request to look less bot-like
        'Cookie':'ali_apache_id=33.50.162.23.1638768571497.305192.2; xman_f=PB+/WUW+7p99qf93WsvlgzlVbukHrCZcEDj0sZEPgwZaYUqchboZlsPaw28L1b4ZgCluYmNEUX+eN/ReyOUk4d7TWLmDrhCtnEkjUv0CdIfEgNim3RJvLg==; cna=VGE0GluNuAICAXd7IplItu6D; _fbp=fb.1.1646474001849.1574633962; _gcl_au=1.1.1066639008.1646474002; xman_t=rVenJ2rmNOEV5R56UUvR/uNqG9rNpJ/eM/3yVCjaGSBHv8IwjUXCEjC13GevhGtJ; account_v=1; XSRF-TOKEN=d842951d-f332-4e0a-9e67-531672d62101; acs_usuc_t=x_csrf=lfwhorq7aehg&acs_rt=f45123692e274fb6941134efaaa6864d; ali_apache_track=; ali_apache_tracktmp=; _bl_uid=LmlCF2XL0jd8vzedF1zCeg719I0s; intl_locale=fr_FR; aep_usuc_f=site=fra&province=&city=&c_tp=EUR&ups_d=1|1|1|1&ups_u_t=1665296464087&region=FR&b_locale=fr_FR&ae_u_p_s=2; _mle_tmp_enc0=Ey%2Fp8LswzxA3J47VsqxI%2B7%2FX5zfHsbjG5rixBvMobRtIo20TFSI87DZCIt0bQG0CUAftZb8p2osmppBPU3AhKboEPcPfFZVi%2BhQ7q37GJt%2BWgxK6Ou7Rrz6B8s3VCnmV; xman_us_f=x_locale=fr_FR&x_l=0&x_c_chg=0&x_as_i=%7B%22cookieCacheEffectTime%22%3A1650335012234%2C%22isCookieCache%22%3A%22Y%22%2C%22ms%22%3A%220%22%7D&acs_rt=6e6397a6dc0845089758e2830f6b7560; xlly_s=1; _gid=GA1.2.211531649.1650875286; _m_h5_tk=5e0f15c71147ed011c5c8eb8fedc8623_1650879697974; _m_h5_tk_enc=e664897138fa3b7cc4a1fa57d3ef8029; aep_history=keywords%5E%0Akeywords%09%0A%0Aproduct_selloffer%5E%0Aproduct_selloffer%0932952011209%094001354377511%0932952011209%091005004111444866%091005004146540365%091005002059594364%0933013223561%091005001843605941; JSESSIONID=145A322AFD9AB8BE0DA2F3A2343347E8; intl_common_forever=IRRci8dLj0TLqATBPMRTXPN3MdTD6O+7Heuhnug0no3deUKGitDlAQ==; tfstk=cOeRBO9CYZblFJiYb7C0YkHsTP2dZkbKl3gHpJ3xmYvfHqvdiXygB-NNNDgZyiC..; l=eBQNuxwRgnUoc8SQmOfwourza77OSIRAguPzaNbMiOCPOS1p55HPB6qrTNY9C3GVh62HR3Jfz-p8BeYBqI0tiitwgdTndvMmn; isg=BBQUx7m0CqFCt52mwVgJqgbm5VKGbThX1vbzWK71oB8imbTj1n0I58obmZEBYXCv; _gat=1; _ga_VED1YSGNC7=GS1.1.1650875286.36.1.1650878519.0; _ga=GA1.1.2143085281.1646474002'
        # NOTE(review): the cookie was captured from a browser session and will
        # expire — refresh it when requests start returning captcha pages.
    }
    try:
        # .strip() instead of re.findall('(.*?)\n', ...)[0]: the regex raised
        # IndexError when the last line of url.txt had no trailing newline.
        address = addr_map['value'].strip()
        # timeout keeps one dead host from hanging the whole crawl.
        res = requests.get(address, headers=headers, timeout=15).content.decode('utf8')

        print('已抓取到' + address)
        return res
    except Exception as e:
        # Best-effort: report and return None so one bad link does not
        # abort the batch; process_htmls() skips None entries.
        print(e)
        return None
  • 提取数据(正则表达式)
def process_htmls(htmls):
    """Extract product rows from the JS-rendered 'data: {...}' blob in each page.

    htmls: iterable of page sources; entries may be None when get_html()
    failed — those are skipped instead of crashing.  Returns a list of
    dicts, one row per matched SKU, ready for csv.DictWriter.
    """
    result_list = []  # one dict per (sku, image) match, across all pages
    for source in htmls:  # 遍历抓取每一条链接
        if not source:
            continue  # fetch failed upstream (get_html returned None)
        if 'Page Not Found - Aliexpress.com' in source:
            continue  # dead link — skip this page
        state = re.findall('data: (.*)', source)  # locate the embedded data blob
        if not state:
            continue  # layout changed / captcha page: skip instead of IndexError
        states = re.sub(r'[\\"]', "", state[0])  # strip backslashes and quotes
        try:
            # sku/image section (renamed from 'property', which shadowed the builtin)
            sku_props = re.findall('skuPropertyValues:(.*)', states)
            name = re.findall('propertyValueDefinitionName:(.*?),', sku_props[0])  # sku labels
            img = re.findall('skuPropertyImagePath:(.*?),', sku_props[0])  # sku images
            imgs = [dict(name=i, img=j) for i, j in zip(name, img)]  # label -> image pairs
            sku_value = re.findall('skuPriceList:(.*)', states)  # price section
            sku = re.findall('skuAttr:(.*?),', sku_value[0])  # raw sku attribute strings
            amount = re.findall('skuActivityAmount:(.*?)}', sku_value[0])  # price objects
            rating_value = re.findall("averageStar:(.+),aver", states)[0]   # 评分
            rating_num = re.findall("totalValidNum:(.+),trial", states)[0]  # 评论数
            order_count = re.findall("tradeCount:(.+),trade", states)[0]    # 订单
            sku_st = set(i + '+' + a for i, a in zip(sku, amount))  # unique sku+price pairs
            title = re.findall("subject:(.*?),ti", states)[0]  # 标题
            product_id = re.findall('productId:(.+),root', states)[0]  # 产品ID
        except IndexError:
            continue  # an expected field is missing — page structure differs; skip
        url = 'https://fr.aliexpress.com/item/' + product_id + '.html'  # 链接
        attrName = re.findall('attrName:(.*?),', states)    # 描述名
        attrValue = re.findall('attrValue:(.*?),', states)  # 描述值
        pro_description = ','.join(i + ':' + j for i, j in zip(attrName, attrValue))
        for ss in sku_st:
            # raw string fixes the invalid-escape warning on '\+'
            sku_code = re.findall(r'#(.*?)\+', ss)  # sku label, e.g. a colour name
            price = re.findall('value:(.*)', ss)    # activity price
            if not sku_code or not price:
                continue  # malformed sku entry — skip this pair only
            sku_code, price = sku_code[0], price[0]
            for entry in imgs:
                if sku_code in entry['name']:
                    result_list.append({
                        '产品标题': title,
                        '标准描述': pro_description,
                        '主图': entry['img'],
                        '评分': rating_value,
                        '评论数': rating_num,
                        '订单': order_count,
                        '产品链接': url,
                        '价格': price,
                        'SKU': sku_code,
                    })
    return result_list
  • 保存数据(csv)
# Dump every scraped row to test.csv; newline='' prevents the csv module
# from doubling line endings on Windows.
columns = ['产品标题', '标准描述', '主图', '评分', '评论数',
           '订单', '产品链接', '价格', 'SKU']
with open('test.csv', 'wt', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=columns)
    writer.writeheader()
    writer.writerows(results)

成果:
(此处为采集结果截图)

selenium就不讲解了,比较简单,xpath可以直接调用,没啥难点。有任何不懂可以私信。

完整项目已上传到资源,有需要的朋友可自行下载👇👇👇

速卖通采集完整源码

各位的支持和认可就是我最大的动力!

  • 3
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值