我在上篇博客记录了批量抓取接口数据的一些解析,详情:https://blog.csdn.net/s_kangkang_A/article/details/116267728
还分析了从 https://log.mmstat.com/eg.js 获取的utid(cna)不可用并做了测试
也提出了解决办法,使用隧道代理和selenium+chromedriver的方式获取utid(cna)做一个支撑抓取的数据库表
但是,从 https://log.mmstat.com/eg.js 获取的utid(cna)真的不可用吗?
在发表上篇博客之后,我偶然运行了一次脚本,用的正是那个从 https://log.mmstat.com/eg.js 获取的,不可用的,测试了拿不到数据的utid(cna)
但结果却发生了变化,数据正常返回了
这是为啥?为什么当时测试没有返回,不经意再次使用就可以了?明明代码什么的都没改变。
等等,到这里,是不是明白了什么,明明代码什么的都没变,那失败和成功之间的变量是什么?没错,是 时间 ,唯一改变的就是时间
也就是说,在一定时间后,utid(cna)变得可用了。于是我做了测试,代码如下:
import base64
import hashlib
import json
import re
import time
import requests
import urllib3
urllib3.disable_warnings()
def get_data(url, utid):
    """Build the JSON ``data`` parameter for the play-info request.

    :param url: video page URL, e.g.
        ``https://v.youku.com/v_show/id_<vid>.html``
    :param utid: utid (cna) value placed into ``steal_params``
    :return: a JSON string with the keys ``biz_params``, ``ad_params`` and
        ``steal_params``; each value is itself a JSON-encoded string
    """
    # Strip the query string first and the scheme second (splitting only
    # once), so that an "id_" or "//" inside the query string cannot leak
    # into the vid or the emb value.
    page = url.split('?')[0].split('//', 1)[-1]
    # The vid is the text between "id_" and the next "." of the page path.
    vid = page.split('id_')[-1].split('.')[0]
    # emb: base64 of a fixed numeric prefix followed by the scheme-less,
    # query-less page URL.
    emb = base64.b64encode(("809715843" + page).encode('utf-8')).decode('utf-8')
    data = {
        "biz_params": {"vid": vid},
        "ad_params": {
            "atm": "",
            "aw": "w",
            "bt": "pc",
            "d": "0",
            "dq": "auto",
            "emb": emb,
            "fu": "0",
            "isvert": "0",
            "needbf": "2",
            "os": "win",
            "osv": "10",
            "p": "1",
            "partnerid": "null",
            "pver": "0.6.16",
            "rst": "mp4",
            "site": "1",
            "sver": "1.1",
            "vip": "0",
            "vs": "1",
            "wintype": "interior"
        },
        "steal_params": {
            "ccode": "0502",
            "client_ip": "192.168.1.1",
            # Current time as a millisecond timestamp string.
            "client_ts": str(round(time.time() * 1000)),
            "utid": utid,
            "version": "0.6.16",
            "ckey": "DIl58SLFxFNndSV1GFNnMQVYkx1PP5tKe1siZu/86PR1u/Wh1Ptd+WOZsHHWxysSfAOhNJpdVWsdVJNsfJ8Sxd8WKVvNfAS8aS8fAOzYARzPyPc3JvtnPHjTdKfESTdnuTW6ZPvk2pNDh4uFzotgdMEFkzQ5wZVXl2Pf1/Y6hLK0OnCNxBj3+nb0v72gZ6b0td+WOZsHHWxysSo/0y9D2K42SaB8Y/+aD2K42SaB8Y/+ahU+WOZsHcrxysooUeND"
        }
    }
    # Each top-level value is serialised to JSON on its own, then the
    # resulting mapping of strings is serialised once more.
    return json.dumps({key: json.dumps(value) for key, value in data.items()})
def get_all_parameter(data_, token=''):
    """Assemble the full query-parameter mapping for the interface request.

    :param data_: the JSON ``data`` string to send and to sign
    :param token: token used when computing ``sign``; empty by default
    :return: dict of query parameters, including ``t`` (current time as a
        millisecond timestamp string) and ``sign``
    """
    timestamp = str(round(time.time() * 1000))
    # The signature is computed over the same timestamp that is sent as "t".
    signature = get_sign(timestamp, data_, token)
    return dict(
        jsv='2.5.0',
        appKey='24679788',
        t=timestamp,
        api='mtop.youku.play.ups.appinfo.get',
        v='1.1',
        sign=signature,
        timeout='20000',
        YKPid='20160317PLF000211',
        YKLoginRequest='true',
        AntiFlood='true',
        AntiCreep='true',
        type='jsonp',
        dataType='jsonp',
        callback='mtopjsonp1',
        data=data_,
    )
def get_sign(t, data, token):
    """Compute the ``sign`` request parameter.

    The signature is the hexadecimal MD5 digest of the UTF-8 encoded string
    ``token&t&appKey&data``.

    :param t: timestamp string (13-digit milliseconds per the caller)
    :param data: the ``data`` request parameter string
    :param token: token taken from the cookie (may be empty)
    :return: 32-character hexadecimal MD5 digest
    """
    app_key = '24679788'
    plain = '&'.join((token, t, app_key, data))
    return hashlib.md5(plain.encode('UTF-8')).hexdigest()
def get_interface(url, utid, session, headers, proxies=None, timeout=20):
    """Fetch the raw text of the video play-info interface.

    A first request is sent with an empty token. If the response text
    contains '令牌为空' ("token is empty"), the token is read from the
    ``_m_h5_tk`` cookie of that response and the request is repeated with a
    signature computed from that token.

    :param url: video page URL
    :param utid: utid (cna) value embedded in the request data
    :param session: session object used for both requests (the caller passes
        a ``requests.Session``)
    :param headers: HTTP headers sent with both requests
    :param proxies: optional proxies mapping; a falsy value means a direct
        connection
    :param timeout: per-request timeout in seconds, so a stalled connection
        cannot block forever
    :return: text of the last response received
    """
    data_ = get_data(url, utid)
    api_url = 'https://acs.youku.com/h5/mtop.youku.play.ups.appinfo.get/1.1/'
    # One set of keyword arguments serves both the proxied and the direct
    # case: requests treats ``proxies=None`` the same as omitting it.
    request_kwargs = {
        'verify': False,
        'headers': headers,
        'proxies': proxies or None,
        'timeout': timeout,
    }

    response = session.get(api_url, params=get_all_parameter(data_), **request_kwargs)
    if '令牌为空' not in response.text:
        return response.text

    # The cookie value has the form "<token>_<suffix>"; only the part before
    # the first underscore is used for signing.
    token_cookie = response.cookies.get('_m_h5_tk')
    if not token_cookie:
        # No token was issued, so a signed retry is impossible; hand back
        # the first response so the caller can inspect it.
        return response.text
    token = token_cookie.split('_')[0]
    print("获取token", token)

    # Repeat the request, this time signed with the token.
    response = session.get(
        api_url, params=get_all_parameter(data_, token=token), **request_kwargs
    )
    return response.text
def parse(url, utid, times=1, delay=5):
    """Request the play-info interface for a video and print the response.

    Each attempt waits ``delay`` seconds, opens a fresh session and calls
    ``get_interface``. A response longer than 10000 characters is treated as
    a successful result and stops the retries.

    :param url: video page URL (also sent as the ``Referer`` header)
    :param utid: utid (cna) value to test
    :param times: maximum number of attempts
    :param delay: seconds to sleep before every attempt
    :return: the last response text, or ``''`` if the last attempt failed
    """
    headers = {
        'Host': "acs.youku.com",
        'Connection': "keep-alive",
        'Pragma': "no-cache",
        'Cache-Control': "no-cache",
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/72.0.3626.119 Safari/537.36",
        'Accept': "*/*",
        'Referer': url,
        'Accept-Encoding': "gzip, deflate, br",
        'Accept-Language': "zh-CN,zh;q=0.9",
        'cache-control': "no-cache",
    }
    result = ''
    for _ in range(times):
        time.sleep(delay)
        # A new session per attempt, closed again when the attempt ends.
        with requests.Session() as session:
            try:
                result = get_interface(url, utid, session, headers)
            except Exception as exc:
                # Best effort: report the failure and treat it as an empty
                # result, without swallowing KeyboardInterrupt/SystemExit.
                print('请求失败:', exc)
                result = ''
        if len(result) > 10000:
            break
    print(result)
    return result
def get_utid(proxies=None, u=None, timeout=20):
    """Obtain a utid (cna) from ``https://log.mmstat.com/eg.js``.

    The response body is printed, and the value of ``Etag="..."`` found in
    it is returned as the utid.

    :param proxies: optional proxies mapping; a falsy value means a direct
        connection
    :param u: optional existing utid; when given it is sent as the ``cna``
        cookie
    :param timeout: request timeout in seconds, so a stalled connection
        cannot block forever
    :return: the Etag value with surrounding whitespace stripped
    :raises ValueError: if the response body contains no ``Etag="..."``
    """
    # The current millisecond timestamp is appended as the "t" query value.
    t = int(time.time() * 1000)
    url = 'https://log.mmstat.com/eg.js?t={}'.format(t)
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
        'Host': 'log.mmstat.com',
        'Referer': 'https://v.youku.com/',
        'TE': 'Trailers',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
    }
    if u:
        headers['Cookie'] = 'cna={}'.format(u)
    # requests treats ``proxies=None`` the same as omitting the argument, so
    # a single call covers both the proxied and the direct case.
    resp = requests.get(url, headers=headers, proxies=proxies or None, timeout=timeout)
    print(resp.text)
    match = re.search(r'Etag="(.*?)"', resp.text, re.S)
    if match is None:
        raise ValueError('no Etag="..." found in the eg.js response')
    return match.group(1).strip()
if __name__ == '__main__':
    video_url = 'https://v.youku.com/v_show/id_XNDA4NTM4NTQwMA==.html'

    # Control group: use the utid immediately after obtaining it.
    fresh_utid = get_utid()
    print('未等待的utid,', fresh_utid)
    parse(video_url, fresh_utid)

    # Experiment group: obtain a utid, wait 60 seconds, then use it.
    aged_utid = get_utid()
    print('等待的utid,', aged_utid)
    time.sleep(60)
    parse(video_url, aged_utid)
上面写了一组对照,一个utid未经过sleep,一个经过sleep。结果如下:
破案了。