如何使utid(cna)有效,解析 https://log.mmstat.com/eg.js

我在上篇博客记录了批量抓取接口数据的一些解析,详情:https://blog.csdn.net/s_kangkang_A/article/details/116267728

还分析了从  https://log.mmstat.com/eg.js  获取的utid(cna)不可用并做了测试

也提出了解决办法:使用隧道代理和 selenium + chromedriver 的方式获取utid(cna),并建一张数据库表来支撑抓取

但是,从 https://log.mmstat.com/eg.js 获取的utid(cna)真的不可用吗?

在发表上篇博客之后,我偶然运行了一次脚本,用的正是那个从 https://log.mmstat.com/eg.js 获取的,不可用的,测试了拿不到数据的utid(cna)

但结果却发生了变化,数据正常返回了

这是为什么?当时测试没有返回数据,后来不经意间再次使用却可以了,明明代码什么的都没有改变。

等等,到这里,是不是明白了什么,明明代码什么的都没变,那失败和成功之间的变量是什么?没错,是 时间 ,唯一改变的就是时间

也就是说,在一定时间后,utid(cna)变得可用了。于是我做了测试,代码如下:

import base64
import hashlib
import json
import re
import time
import requests
import urllib3

# Silence urllib3 warnings; get_interface sends its requests with verify=False.
urllib3.disable_warnings()


def get_data(url, utid):
    """
    Build the JSON-encoded ``data`` request parameter for a video page.

    :param url: video page URL; the vid is the text after the last ``id_``
                up to the next ``.``
    :param utid: utid (cna) value placed into ``steal_params``
    :return: JSON string of a dict whose three values (``biz_params``,
             ``ad_params``, ``steal_params``) are themselves JSON strings
    """
    # The vid sits between "id_" and the first "." that follows it.
    vid = url.split('id_')[-1].split('.')[0]
    # Scheme and query string are dropped before the URL goes into ``emb``.
    bare_url = url.split('//')[-1].split('?')[0]
    # ``emb`` is the Base64 encoding of a fixed prefix plus the stripped URL.
    emb = base64.b64encode(("809715843" + bare_url).encode('utf-8')).decode('utf-8')

    biz_params = {"vid": vid}
    ad_params = {
        "atm": "",
        "aw": "w",
        "bt": "pc",
        "d": "0",
        "dq": "auto",
        "emb": emb,
        "fu": "0",
        "isvert": "0",
        "needbf": "2",
        "os": "win",
        "osv": "10",
        "p": "1",
        "partnerid": "null",
        "pver": "0.6.16",
        "rst": "mp4",
        "site": "1",
        "sver": "1.1",
        "vip": "0",
        "vs": "1",
        "wintype": "interior",
    }
    steal_params = {
        "ccode": "0502",
        "client_ip": "192.168.1.1",
        # Current time in milliseconds, as a string.
        "client_ts": str(round(time.time() * 1000)),
        "utid": utid,
        "version": "0.6.16",
        "ckey": "DIl58SLFxFNndSV1GFNnMQVYkx1PP5tKe1siZu/86PR1u/Wh1Ptd+WOZsHHWxysSfAOhNJpdVWsdVJNsfJ8Sxd8WKVvNfAS8aS8fAOzYARzPyPc3JvtnPHjTdKfESTdnuTW6ZPvk2pNDh4uFzotgdMEFkzQ5wZVXl2Pf1/Y6hLK0OnCNxBj3+nb0v72gZ6b0td+WOZsHHWxysSo/0y9D2K42SaB8Y/+aD2K42SaB8Y/+ahU+WOZsHcrxysooUeND",
    }

    # Each section is serialised on its own, then the wrapper is serialised
    # again; section order is kept because the resulting string gets signed.
    sections = (
        ("biz_params", biz_params),
        ("ad_params", ad_params),
        ("steal_params", steal_params),
    )
    return json.dumps({name: json.dumps(section) for name, section in sections})


def get_all_parameter(data_, token=''):
    """
    Assemble the full query-parameter dict for the mtop request.

    :param data_: JSON string sent as the ``data`` parameter and fed to get_sign
    :param token: token passed to get_sign; empty string by default
    :return: dict of query parameters, including the timestamp ``t`` and the
             ``sign`` computed from it
    """
    # Millisecond timestamp; the same value is sent as ``t`` and used in the sign.
    timestamp = str(round(time.time() * 1000))
    signature = get_sign(timestamp, data_, token)
    return {
        'jsv': '2.5.0',
        'appKey': '24679788',
        't': timestamp,
        'api': 'mtop.youku.play.ups.appinfo.get',
        'v': '1.1',
        'sign': signature,
        'timeout': '20000',
        'YKPid': '20160317PLF000211',
        'YKLoginRequest': 'true',
        'AntiFlood': 'true',
        'AntiCreep': 'true',
        'type': 'jsonp',
        'dataType': 'jsonp',
        'callback': 'mtopjsonp1',
        'data': data_,
    }


def get_sign(t, data, token):
    """
    Compute the request signature.

    The signature is the hex MD5 digest of ``token&t&appKey&data``.

    :param t: timestamp string (13-digit milliseconds in this script)
    :param data: the ``data`` request parameter string
    :param token: token string taken from the cookie; may be empty
    :return: 32-character hexadecimal MD5 digest
    """
    app_key = '24679788'
    plaintext = '&'.join((token, t, app_key, data))
    return hashlib.md5(plaintext.encode('UTF-8')).hexdigest()


def get_interface(url, utid, session, headers, proxies=None):
    """
    Fetch the raw response text of the video play-info interface.

    The first request is signed with an empty token. When its body contains
    '令牌为空', the token is read from the ``_m_h5_tk`` cookie of that
    response (the part before the first ``_``) and the request is sent once
    more with a signature built from that token.

    :param url: video page URL, used to build the ``data`` parameter
    :param utid: utid (cna) value, forwarded to get_data
    :param session: object exposing a ``requests.Session``-style ``get``
    :param headers: HTTP headers sent with every request
    :param proxies: optional proxies mapping; only forwarded when truthy, so
                    both proxied and direct requests are supported
    :return: text of the last response received
    """
    data_ = get_data(url, utid)
    api_url = 'https://acs.youku.com/h5/mtop.youku.play.ups.appinfo.get/1.1/'

    # Shared keyword arguments, so the proxied and direct calls are one code path.
    request_kwargs = {'verify': False, 'headers': headers}
    if proxies:
        request_kwargs['proxies'] = proxies

    response = session.get(api_url, params=get_all_parameter(data_), **request_kwargs)
    if '令牌为空' not in response.text:
        return response.text

    # The token comes from the cookie set by the first response.
    tk_cookie = response.cookies.get('_m_h5_tk')
    if not tk_cookie:
        # No cookie means nothing to sign with; hand the error body back
        # instead of failing with a KeyError.
        return response.text
    token = tk_cookie.split('_')[0]
    print("获取token", token)

    # Repeat the request, this time signed with the token.
    signed_params = get_all_parameter(data_, token=token)
    response = session.get(api_url, params=signed_params, **request_kwargs)
    return response.text


def parse(url, utid):
    """
    Request the play-info interface for a video and print the response text.

    Each attempt sleeps 5 seconds, then calls get_interface with a fresh
    session. A response longer than 10000 characters ends the loop early;
    otherwise the attempt counter is decremented (only one attempt is
    configured). Whatever was received last is printed; an attempt that
    raised leaves an empty string.

    :param url: video page URL; also sent as the ``Referer`` header
    :param utid: utid (cna) value under test
    :return: None; the result is printed
    """
    result = ''
    attempts_left = 1
    headers = {
        'Host': "acs.youku.com",
        'Connection': "keep-alive",
        'Pragma': "no-cache",
        'Cache-Control': "no-cache",
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/72.0.3626.119 Safari/537.36",
        'Accept': "*/*",
        'Referer': url,
        'Accept-Encoding': "gzip, deflate, br",
        'Accept-Language': "zh-CN,zh;q=0.9",
        'cache-control': "no-cache",
    }
    while attempts_left:
        time.sleep(5)
        # A fresh session per attempt, closed again once the attempt is over.
        with requests.Session() as session:
            try:
                result = get_interface(url, utid, session, headers)
            except Exception as exc:
                # Best effort: report the failure and continue with an empty
                # result, while letting KeyboardInterrupt/SystemExit through.
                print('请求失败:', exc)
                result = ''
        if len(result) > 10000:
            break
        attempts_left -= 1
    print(result)


def get_utid(proxies=None, u=None):
    """
    Request eg.js from log.mmstat.com and extract the utid (cna) from the body.

    :param proxies: optional proxies mapping; only forwarded when truthy
    :param u: optional existing cna value, sent as the ``cna`` cookie
    :return: stripped text captured from ``Etag="..."`` in the response body
    :raises ValueError: when the response body contains no ``Etag="..."``
    """
    t = int(time.time() * 1000)
    url = 'https://log.mmstat.com/eg.js?t={}'.format(t)
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
        'Host': 'log.mmstat.com',
        'Referer': 'https://v.youku.com/',
        'TE': 'Trailers',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
    }
    if u:
        headers['Cookie'] = 'cna={}'.format(u)

    # One call site for both the proxied and the direct request.
    request_kwargs = {'headers': headers}
    if proxies:
        request_kwargs['proxies'] = proxies
    resp = requests.get(url, **request_kwargs)
    print(resp.text)

    match = re.search(r'Etag="(.*?)"', resp.text, re.S)
    if match is None:
        # Fail with a clear message rather than AttributeError on None.
        raise ValueError('响应中未找到 Etag,无法提取 utid')
    return match.group(1).strip()


if __name__ == '__main__':
    video_url = 'https://v.youku.com/v_show/id_XNDA4NTM4NTQwMA==.html'

    # Control arm: a newly fetched utid is handed to parse straight away
    # (parse itself still pauses 5 seconds before requesting).
    fresh_utid = get_utid()
    print('未等待的utid,', fresh_utid)
    parse(video_url, fresh_utid)

    # Experiment arm: fetch another utid and let it age for 60 seconds first.
    aged_utid = get_utid()
    print('等待的utid,', aged_utid)
    time.sleep(60)
    parse(video_url, aged_utid)

上面写了一组对照,一个utid未经过sleep,一个经过sleep。结果如下:

破案了:刚从 https://log.mmstat.com/eg.js 获取的utid(cna)并不是不可用,而是需要等待一段时间之后才会生效。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值