#!/usr/bin/env python
# -*- coding:utf-8 -*-
__author__="JUNHAN"
# Environment: Python 3.6.5
# 1. Import third-party libraries
import functools
import execjs
import traceback
from urllib.parse import quote_plus
import requests, json, time, datetime, random, re
from urllib.parse import quote
from user_check_proxy import Proxy_start
from logs import logDebug, logInfo
#代理自己加上,或者不加代理
from user_check_proxy import get_proxy2
#过客网支持淘宝、天猫、京东、苏宁、当当、网易考拉、亚马逊等商品网址
import warnings
warnings.filterwarnings('ignore')
# 2. Mobile (H5) User-Agent
def random_h5_ua():
    """Return one mobile User-Agent string picked at random.

    Every candidate is an Android OPPO device string that carries the
    MQQBrowser / MicroMessenger markers.

    Returns:
        str: One of the five User-Agent strings listed below.
    """
    candidates = (
        'Mozilla/5.0 (Linux; Android 5.1; OPPO A37m Build/LMY47I; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.5.23.1180 NetType/4G Language/zh_CN',
        'Mozilla/5.0 (Linux; Android 7.1.1; OPPO R11 Build/NMF26X; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.5.23.1180 NetType/4G Language/zh_CN',
        'Mozilla/5.0 (Linux; Android 5.1.1; OPPO R9 Plusm A Build/LMY47V; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.5.23.1180 NetType/WIFI Language/zh_CN',
        'Mozilla/5.0 (Linux; Android 7.1.1; OPPO R11 Build/NMF26X; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.5.23.1180 NetType/WIFI Language/zh_CN',
        'Mozilla/5.0 (Linux; Android 7.1.1; OPPO R11 Pluskt Build/NMF26X; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.5.23.1180 NetType/WIFI Language/zh_CN',
    )
    chosen = random.choice(candidates)
    return chosen
# 3. Desktop (PC) User-Agent
def random_web_ua():
    """Return one desktop User-Agent string picked at random.

    Returns:
        str: One of the six Windows 10 browser User-Agent strings below.
    """
    candidates = (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
    )
    chosen = random.choice(candidates)
    return chosen
# 4. Helper that produces the 30-day date window
# Dates of the 30 days before today (today excluded), returned as a list
def days_ago(days=30):
    """Return the dates of the ``days`` days preceding today, oldest first.

    Today itself is excluded, so the default call yields 30 strings that run
    from 30 days ago up to yesterday.

    Args:
        days: How many past days to list. Defaults to 30; a value below 1
            yields an empty list.

    Returns:
        list[str]: Dates formatted ``YYYY-MM-DD`` (local calendar date),
        in ascending order.
    """
    today = datetime.date.today()
    # Count down from the farthest offset so the result is oldest-first.
    return [
        (today - datetime.timedelta(days=offset)).isoformat()
        for offset in range(days, 0, -1)
    ]
# 5. Timestamp conversion
def get_timestamp_str(timestamp):
    """Format a Unix timestamp as a local-time ``YYYY-MM-DD`` string.

    Args:
        timestamp: Seconds since the epoch, as accepted by ``time.localtime``.

    Returns:
        str: The local calendar date of ``timestamp``.
    """
    local_struct = time.localtime(timestamp)
    date_text = time.strftime('%Y-%m-%d', local_struct)
    return date_text
def get_guoke_price_web(item_url):
    """Fetch the raw price-history response text for a product URL from tool168.cn.

    Three HTTP requests are made in order:

    1. GET the history page and read the ``value`` of the element whose id
       is ``checkCodeId``.
    2. POST that value together with the product URL to ``/dm/ptinfo.php``
       and read ``code`` from the JSON reply.
    3. POST ``code`` to ``/dm/history.php`` and return the reply body.

    Args:
        item_url: Product page URL to look up.

    Returns:
        str | None: The stripped body of the third response, or ``None`` when
        the history page carries no ``checkCodeId`` value or the final
        response contains the site's "not found" message.

    Raises:
        Exception: Network errors raised by ``requests`` and JSON decoding
            errors from the second response are not caught here.
    """
    # Proxy lookup: plug in your own proxy pool or cloud proxy here.
    # To run without a proxy, comment out the next line and remove
    # ``proxies=proxies`` from the three requests below.
    proxies = get_proxy2()
    ua = random_web_ua()
    # Percent-encoded forms, used only inside the hand-built Referer URLs.
    k = quote_plus(item_url)
    btnSearch = quote_plus('搜索')

    # 6. Start requesting. Step 1: load the history page to obtain checkCode.
    url_01 = 'http://www.tool168.cn/?'
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'www.tool168.cn',
        'Referer': 'http://www.tool168.cn/history/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': ua,
    }
    params = {
        'm': 'history',
        'a': 'view',
        # Raw values on purpose: requests percent-encodes ``params`` itself,
        # so passing the already-quoted ``k`` / ``btnSearch`` would encode
        # them a second time.
        'k': item_url,
        'btnSearch': '搜索',
    }
    response_html_01 = requests.get(url=url_01, headers=header, params=params,
                                    proxies=proxies, verify=False, timeout=20)
    result_html_01 = response_html_01.text
    check_match = re.search('id="checkCodeId" value="(.*?)"', result_html_01)
    if check_match is None:
        # Without the token the following requests cannot be built.
        return None
    checkCode = check_match.group(1)

    # Step 2: exchange checkCode + product URL for the history "code".
    url_02 = "http://www.tool168.cn/dm/ptinfo.php"
    header = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        # No fixed Content-Length: the form body length depends on item_url,
        # and requests sets the header from the encoded body.
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': 'www.tool168.cn',
        'Origin': 'http://www.tool168.cn',
        'Referer': 'http://www.tool168.cn/?m=history&a=view&k={}&btnSearch={}'.format(k, btnSearch),
        'User-Agent': ua,
        'X-Requested-With': 'XMLHttpRequest',
    }
    data = {
        'checkCode': checkCode,
        'con': item_url,
    }
    response_html_02 = requests.post(url=url_02, headers=header, data=data,
                                     proxies=proxies, verify=False, timeout=20)
    result_html_02 = response_html_02.text
    code = json.loads(result_html_02).get("code")

    # Step 3: request the price-history payload for that code.
    url_03 = "http://www.tool168.cn/dm/history.php?"
    header = {
        'Accept': 'text/plain, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Host': 'www.tool168.cn',
        'Origin': 'http://www.tool168.cn',
        # Use the percent-encoded URL, matching the Referer built in step 2.
        'Referer': 'http://www.tool168.cn/?m=history&a=view&k={}'.format(k),
        'User-Agent': ua,
        'X-Requested-With': 'XMLHttpRequest',
    }
    params = {
        "code": code,
        't': '',
    }
    response_html_03 = requests.post(url=url_03, headers=header, params=params,
                                     proxies=proxies, verify=False, timeout=20)
    response_html_03.encoding = "utf-8"
    result_response = response_html_03.text.strip()

    # The site answers with this message when the product is not indexed
    # (or failed to load); report that as "no data".
    if "对不起,没有找到。" in result_response:
        return None
    return result_response
# 7. Parse dates and historical prices
def parse(result_history_price, thirty_date=None):
    """Convert chart records into a ``{date: price_in_cents}`` dict for the 30-day window.

    Each record is a string of the form ``[(year,month,day),price]`` where
    ``month`` is zero-based (it is incremented by one here).

    Args:
        result_history_price: Iterable of record strings.
        thirty_date: First day of the window as ``YYYY-MM-DD``. Defaults to
            ``days_ago()[0]``.

    Returns:
        dict[str, int]: Maps ``YYYY-MM-DD`` to the price multiplied by 100
        and rounded to an integer. Contains every record dated on or after
        ``thirty_date``; when the window start itself has no record, the
        price of the latest earlier record is added under ``thirty_date``.
        Empty when no records are given.

    Raises:
        ValueError: If a record does not contain ``(y,m,d)`` followed by
            ``,price]``, or its date/price fields are not numeric.
    """
    if thirty_date is None:
        thirty_date = days_ago()[0]
    window_start = int(thirty_date.replace('-', ''))

    # Full history as [date_string, price_string] pairs.
    history_price_list = []
    for res in result_history_price:
        date_match = re.search(r'\((.*?)\)', res)
        price_match = re.search(r'\),(.*?)]', res)
        if date_match is None or price_match is None:
            raise ValueError(f"unrecognised price record: {res!r}")
        year, month, day = [part.strip() for part in date_match.group(1).split(",")[:3]]
        # Month arrives zero-based; month and day are zero-padded to two digits.
        shop_history_time = f"{year}-{int(month) + 1:02d}-{int(day):02d}"
        history_price_list.append([shop_history_time, price_match.group(1).strip()])

    if not history_price_list:
        return {}

    history_keys = [int(entry[0].replace('-', '')) for entry in history_price_list]

    # 8. Pick out the records that fall inside the 30-day window.
    thirty_days_price = [
        entry
        for entry, key in zip(history_price_list, history_keys)
        if key >= window_start
    ]

    if not thirty_days_price:
        # No record inside the window: the most recent earlier price is
        # taken as the price for the window start.
        thirty_days_price = [[thirty_date, history_price_list[-1][1]]]
    elif window_start not in history_keys:
        # Find the adjacent pair of records that straddles the window start
        # and carry the earlier price forward to the first day.
        # Assumes records are in chronological order -- TODO confirm.
        # When no such pair exists (e.g. every record lies inside the
        # window) nothing is added.
        for idx in range(len(history_keys) - 1):
            if history_keys[idx] < window_start < history_keys[idx + 1]:
                thirty_days_price.insert(0, [thirty_date, history_price_list[idx][1]])
                break

    # round() rather than int(): float("19.99") * 100 is 1998.999..., which
    # int() would truncate to 1998.
    return {
        date_text: round(float(price_text) * 100)
        for date_text, price_text in thirty_days_price
    }
def gkw_history_prices(item_url):
    """Return the 30-day price history for a product URL, or ``None`` on failure.

    Args:
        item_url: Product page URL passed to ``get_guoke_price_web``.

    Returns:
        dict | None: The dict produced by ``parse`` for the records found in
        the ``chart("...")`` call of the response, or ``None`` when the
        request fails, no response text is returned, the chart data is
        missing, or parsing raises.
    """
    try:
        result = get_guoke_price_web(item_url)
    except Exception:
        # Request stage failed (network error, unexpected page, ...).
        return None
    if not result:
        return None

    try:
        chart_data = re.search(r'chart\("(.*?)".*\);', result, re.S).group(1)
        # Keep every "[(y,m,d),price]" record whole: parse() needs both the
        # date and the price in the same string, so the data must not be
        # split on the commas inside a record.
        result_history_price = re.findall(
            r'\[[^\[\]]+\]', chart_data.replace("Date.UTC", "")
        )
        if not result_history_price:
            return None
        return parse(result_history_price)
    except Exception:
        # Parse stage failed (no chart call in the response, malformed record).
        return None
if __name__ == '__main__':
    # Demo run: put a product URL from one of the supported platforms here.
    item_url = "https://item.jd.com/5475614.html"
    history = gkw_history_prices(item_url)
    print(history)