【干货】批量查询阿里巴巴国际站后台排名数据源码-python运营提效-(第二期)

本文链接：https://blog.csdn.net/weixin_43297217/article/details/107087269

实测每个关键词查询速度为0.08s左右，远快于某些一年上千的后台插件

使用方法：登录国际站后台，打开下面的排名查询页面，按 F12 打开开发者工具，从请求中复制 cookie 和 _csrf_token_，分别填入代码中的 'cookie' 和 '_csrf_token_' 两处：
https://hz-productposting.alibaba.com/product/ranksearch/rankSearch.htm

import requests
from lxml import etree
import time
import csv
from multiprocessing import Pool

def get_words(path='key.txt'):
    """Load the keywords to query from a text file.

    The file holds one keyword per line.  Surrounding whitespace is
    stripped and blank lines are skipped, so an empty line or a trailing
    newline never turns into an empty query.

    Args:
        path: Keyword file to read; defaults to ``key.txt`` in the current
            working directory.

    Returns:
        A ``(words, num)`` tuple: the keywords in file order and how many
        of them there are.
    """
    # 'utf-8-sig' reads plain UTF-8 as well, but drops a leading BOM that
    # would otherwise stay glued to the first keyword (strip() keeps it).
    with open(path, 'r', encoding='utf-8-sig') as f:
        words = [line.strip() for line in f if line.strip()]
    num = len(words)
    print('一共有%s个单词，需要查询'%num)
    return words,num

def get_result(word,url,timeout=30):
    """POST one keyword to the rank-search page and return the response body.

    Args:
        word: Keyword sent as the ``queryString`` form field.
        url: Address the form is posted to.
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout a stalled connection would block the calling worker
            indefinitely.

    Returns:
        The response body decoded as text (``Response.text``).  The HTTP
        status is not checked here.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    # Browser-like headers copied from a real session.
    # NOTE(review): 'br' is advertised in accept-encoding; confirm the
    # environment can decode Brotli responses, otherwise drop it.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-TW;q=0.6',
        'origin': 'https://hz-productposting.alibaba.com',
        'referer': 'https://hz-productposting.alibaba.com/product/ranksearch/rankSearch.htm',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'content-type': 'application/x-www-form-urlencoded',
        'cache-control': 'max-age=0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'cookie':'自己的cookies'  # placeholder: replace it whenever the script suddenly stops working
    }
    data = {
        '_csrf_token_': '自己的token',  # placeholder: replace it whenever the script suddenly stops working
        'queryString': word,
    }
    res = requests.post(url=url, headers=headers, data=data, timeout=timeout)
    return res.text


def parse_source(source,file_path,sk):
    """Extract the ranking row for one keyword from the result page and save it.

    Every outcome is appended to the CSV through ``save_file``:

    * no ``<tbody>`` can be found (or the page cannot be parsed at all):
      ``[sk, <parse-error message>]``;
    * the row has no product link: ``[sk, '无匹配产品']``;
    * otherwise: ``[sk, product, ranking, result]`` with the text of the
      ``charge`` cell appended when that cell has one.

    Args:
        source: HTML text of the rank-search response.
        file_path: Output file name without the ``.csv`` suffix.
        sk: The keyword that was searched; first column of every row.

    Returns:
        None.
    """
    try:
        html = etree.HTML(source)
    except (etree.LxmlError, ValueError):
        # Unparseable or empty document: handled as a parse failure below.
        html = None
    tbodies = html.xpath("//tbody") if html is not None else []
    if not tbodies:
        line = [sk, '页面解析错误！检查cookies或稍后重试。']
        print(line)
        save_file(line, file_path)
        return None

    # The slice controls how many product rows are recorded per keyword.
    trs = tbodies[0].xpath('./tr')[:1]

    # The second text node of the search-result div, trimmed by one leading
    # and four trailing characters.
    # NOTE(review): presumably this leaves just the result count - confirm
    # against a live page.  A missing node yields an empty column instead
    # of an IndexError.
    summary = html.xpath("//div[@class='search-result']/text()")
    result = summary[1].strip()[1:-4] if len(summary) > 1 else ''

    for tr in trs:
        products = tr.xpath("td[@class='products']/a/text()")
        if not products:
            print(sk,"无匹配产品")
            line = [sk,'无匹配产品']
            save_file(line, file_path)
            continue

        ranking = tr.xpath("td[@class='ranking']/a/text()")
        charge = tr.xpath("td[@class='charge']/span/text()")
        # Tolerate a row with a product but no ranking link.
        rank = ranking[0] if ranking else ''
        if charge:
            line = [sk,products[0],rank,result,charge[0]]
            print(sk,rank,charge[0])
        else:
            line = [sk,products[0],rank,result]
            print(sk,rank)
        save_file(line,file_path)

def save_file(line,file_path):
    """Print one result row and append it to ``<file_path>.csv``.

    Args:
        line: Sequence of cell values forming one CSV row.
        file_path: Output file name without the ``.csv`` suffix.

    The file is opened in append mode for each row and closed before
    returning, so the row is flushed to disk and no handle is left open.
    The platform default text encoding is used.
    """
    print(line)
    # newline="" lets the csv module control line endings itself.
    with open('%s.csv' % file_path, 'a', newline="") as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        writer.writerow(line)
def run(word, num, a, total_start):
    """Query one keyword, store its ranking row and print progress.

    Intended to be executed as a pool task.  Any failure while fetching or
    parsing is reported and written to the CSV as a row for this keyword,
    because an exception escaping a pool task whose result is never
    collected would otherwise disappear without a trace.

    Args:
        word: Keyword to query.
        num: Total number of keywords, used only in the progress message.
        a: 1-based position of this keyword, used in the progress message
            and as the divisor of the running average.
        total_start: ``time.time()`` value taken when the whole batch
            started.
    """
    file_path = '结果保存的文件名'  # presumably a placeholder: base name of the output CSV
    url = 'https://hz-productposting.alibaba.com/product/ranksearch/rankSearch.htm'
    print('开始查询%s的排名！---------%s/%s'%(word,a,num))
    try:
        source = get_result(word,url)
        parse_source(source,file_path,word)
    except Exception as exc:  # task boundary: record the failure, keep the batch going
        print('查询%s失败：%r' % (word, exc))
        save_file([word, '查询失败：%s' % exc], file_path)
    else:
        print('保存成功!')
    time_now = time.time() - total_start
    print('总花费时间%.2f秒，平均花费%.2f秒'%(time_now,(time_now/a)))
    print('*'*40)

if __name__ == '__main__':
    # Script entry point: read the keywords, fan the queries out over a
    # process pool, then print the total and average time per keyword.
    total_start = time.time()
    words, num = get_words()

    def report_failure(exc):
        # Runs in the parent process when a task raised.  A try/except
        # around apply_async only sees submission errors, never errors
        # raised inside the worker.
        print('进程报错！%r' % (exc,))

    # 16 worker processes; setting this too high triggers a captcha - if
    # that happens, wait for a while and retry.
    po = Pool(16)
    for a, word in enumerate(words, start=1):
        po.apply_async(run, args=(word, num, a, total_start),
                       error_callback=report_failure)
    po.close()
    po.join()
    total_total = time.time() - total_start
    # Guard the average against an empty keyword list.
    average = total_total / num if num else 0.0
    print('总用时%.2f秒，平均用时%.2f秒' % (total_total, average))