# 实测每个关键词的查询耗时约为 0.08 秒,远快于某些年费上千元的后台插件。
# 使用方法:打开国际站后台的排名查询页面,按 F12 获取 cookies 和 token 并填入代码:
# https://hz-productposting.alibaba.com/product/ranksearch/rankSearch.htm
import requests
from lxml import etree
import time
import csv
from multiprocessing import Pool
def get_words():
    """Load the keywords to query from ``key.txt`` in the working directory.

    The file is expected to hold one keyword per line. Surrounding whitespace
    is stripped from every line, and lines that are empty after stripping are
    skipped so that an empty keyword is never queried or counted.

    Returns:
        tuple: ``(words, num)`` where ``words`` is the list of keywords in
        file order and ``num`` is ``len(words)``.

    Raises:
        OSError: if ``key.txt`` cannot be opened.
    """
    # 'utf-8-sig' reads plain UTF-8 as well as UTF-8 with a BOM, so a BOM
    # written by the editor does not end up glued to the first keyword.
    with open('key.txt', 'r', encoding='utf-8-sig') as f:
        stripped = [line.strip() for line in f]
    words = [word for word in stripped if word]
    num = len(words)
    print('一共有%s个单词,需要查询'%num)
    return words,num
def get_result(word,url,timeout=10):
    """POST a rank-search query for one keyword and return the response body.

    Args:
        word: keyword sent as the ``queryString`` form field.
        url: address the form is posted to.
        timeout: seconds passed to ``requests.post`` as its ``timeout``
            argument, so a stalled connection raises instead of blocking the
            calling worker indefinitely.

    Returns:
        str: the response text (``res.text``). The HTTP status is not checked
        here; callers receive whatever page the server sent back.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
    """
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        # NOTE(review): 'br' is advertised here; requests only decodes Brotli
        # when a brotli package is installed -- confirm in the target setup.
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-TW;q=0.6',
        'origin': 'https://hz-productposting.alibaba.com',
        'referer': 'https://hz-productposting.alibaba.com/product/ranksearch/rankSearch.htm',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'content-type': 'application/x-www-form-urlencoded',
        'cache-control': 'max-age=0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        # Placeholder: paste your own cookies; replace them when the script
        # suddenly stops working.
        'cookie':'自己的cookies',
    }
    data = {
        # Placeholder: paste your own token; replace it when the script
        # suddenly stops working.
        '_csrf_token_': '自己的token',
        'queryString': word,
    }
    res = requests.post(url=url,headers=headers,data=data,timeout=timeout)
    return res.text
def parse_source(source,file_path,sk):
    """Extract the ranking row for one keyword from a result page and save it.

    Writes one CSV row through ``save_file``:

    * ``[sk, <parse-error message>]`` when the page cannot be parsed or has
      no ``<tbody>``;
    * ``[sk, <no-match message>]`` when the first table row has no product
      link;
    * ``[sk, product, ranking, result]`` otherwise, with the ``charge`` text
      appended as a fifth column when the row has one.

    Args:
        source: HTML text of the result page.
        file_path: output name forwarded unchanged to ``save_file``.
        sk: the keyword the page was requested for; first column of the row.

    Returns:
        None.
    """
    try:
        html = etree.HTML(source)
    except (ValueError, etree.LxmlError):
        # Unparseable input is reported the same way as a page without a
        # result table instead of raising out of the worker.
        html = None

    tbodies = html.xpath("//tbody") if html is not None else []
    if not tbodies:
        line = [sk, '页面解析错误!检查cookies或稍后重试。']
        print(line)
        save_file(line, file_path)
        return None
    tbody = tbodies[0]

    # Second text node of the "search-result" div with its first character
    # and last four characters removed -- presumably the total number of
    # results; TODO confirm against the live page. Falls back to '' when the
    # node is missing so the ranking row is still written.
    texts = html.xpath("//div[@class='search-result']/text()")
    result = texts[1].strip()[1:-4] if len(texts) > 1 else ''

    # The slice controls how many product rows are read per keyword.
    for tr in tbody.xpath('./tr')[:1]:
        products = tr.xpath("td[@class='products']/a/text()")
        if not products:
            print(sk,"无匹配产品")
            save_file([sk,'无匹配产品'], file_path)
            continue

        ranking = tr.xpath("td[@class='ranking']/a/text()")
        charge = tr.xpath("td[@class='charge']/span/text()")
        # An absent ranking cell becomes '' rather than raising IndexError.
        rank = ranking[0] if ranking else ''
        if charge:
            line = [sk,products[0],rank,result,charge[0]]
            print(sk,rank,charge[0])
        else:
            line = [sk,products[0],rank,result]
            print(sk,rank)
        save_file(line,file_path)
def save_file(line,file_path):
    """Print one result row and append it to ``<file_path>.csv``.

    Args:
        line: sequence of cell values written as a single CSV row.
        file_path: output path without the ``.csv`` extension.

    Raises:
        OSError: if the output file cannot be opened or written.
    """
    print(line)
    # The context manager closes (and therefore flushes) the file after every
    # row instead of leaving the handle open.
    with open('%s.csv' % file_path, 'a+',newline="") as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        writer.writerow(line)
def run(word, num, a, total_start, file_path='结果保存的文件名',
        url='https://hz-productposting.alibaba.com/product/ranksearch/rankSearch.htm'):
    """Query, parse and save the ranking of a single keyword.

    Args:
        word: keyword to look up.
        num: total number of keywords, used only in the progress message.
        a: 1-based position of this keyword, used in the progress message and
            as the divisor for the running average.
        total_start: ``time.time()`` value taken when the whole job started.
        file_path: output name (without ``.csv``) passed on to
            ``parse_source``; defaults to the previously hard-coded value.
        url: address of the rank-search page; defaults to the previously
            hard-coded value.
    """
    print('开始查询%s的排名!---------%s/%s'%(word,a,num))
    source = get_result(word,url)
    parse_source(source,file_path,word)
    # Printed unconditionally: parse_source does not report whether it wrote
    # a ranking row or an error row.
    print('保存成功!')
    time_now = time.time() - total_start
    print('总花费时间%.2f秒,平均花费%.2f秒'%(time_now,(time_now/a)))
    print('*'*40)
if __name__ == '__main__':
    # Script entry point: read the keywords, fan the queries out over a
    # process pool, then print the total and average elapsed time.
    total_start = time.time()
    words, num = get_words()

    def report_error(exc):
        # Called in the parent process when a worker task raises. Without it
        # the exception is only stored on the AsyncResult, which this script
        # never inspects, so the failure would go unnoticed.
        print('进程报错!', exc)

    # 16 worker processes; going much higher triggers a captcha, after which
    # you have to wait a while before retrying.
    po = Pool(16)
    for a, word in enumerate(words, start=1):
        po.apply_async(run, args=(word, num, a, total_start),
                       error_callback=report_error)
    po.close()
    po.join()

    total_total = time.time() - total_start
    # Guard the average against an empty keyword list.
    average = total_total / num if num else 0.0
    print('总用时%.2f秒,平均用时%.2f秒' % (total_total, average))