最近在做关于淘宝买家秀的爬虫,其中的关键无非就是sign的生成。这里我来介绍下自己总结的几个点。
1.数据
如图。淘宝传下来的数据存在js文件中
2.参数
appKey: 12574478
t: 1560094920983
sign: 9fd51773ab6c80205f4a0c2f97ca14c6
api: mtop.taobao.social.feed.aggregate
v: 1.0
timeout: 300000
timer: 300000
type: jsonp
dataType: jsonp
callback: mtopjsonp1
data: {"params":"{\"nodeId\":\"\",\"sellerId\":\"50852803\"}","cursor":"1","pageNum":"1","pageId":5703,"env":"1"}
其中:
t 和 sign是动态生成的。其他都为固定字段
关于 t:
等于当前的时间戳乘以1000取整再转字符串
# Current Unix timestamp in milliseconds, converted to a string
t = str(int(time.time() * 1000))
关于sign:
通过全文搜索sign,在这个js代码中找到了sign的生成方法,
sign = md5(token + '&' + t + '&' + appKey + '&' + data)
关于上面拼接字符串的几个参数解释一下:
token : 取自cookies中的_m_h5_tk,该cookie的值由token加下划线再加时间戳组成,如:
_m_h5_tk=2836432b51e57711db70ca4984ee0cc6_1560102469721
那么:token就为2836432b51e57711db70ca4984ee0cc6
t : 当前时间戳
appKey : 固定值,可以在js代码中找到,比如我的是 12574478
data : 请求参数中data字段的值(这里需要注意一下):
比如浏览器中是这样:
data:{"params":"{\"nodeId\":\"\",\"sellerId\":\"50852803\"}","cursor":"1","pageNum":"1","pageId":5703,"env":"1"}
那么python代码中就应该在每个反斜杠前再加一个反斜杠(否则 \" 会被Python转义成 ",反斜杠就丢了),如下:
data:{"params":"{\\"nodeId\\":\\"\\",\\"sellerId\\":\\"50852803\\"}","cursor":"1","pageNum":"1","pageId":5703,"env":"1"}
在这里我当初就是忽略了这点,导致md5码一直生成不对,卡了很久。
在上图中g(token + '&' + t + '&' + appKey + '&' + data),其实就是将拼接的字符串生成32位md5码,通过断点找出该源码:
md5 的 js 代码:
// MD5 routine used by the page to build `sign`: takes a string `a` and
// returns a lowercase hexadecimal digest string (four 32-bit words, 8 hex
// characters each).
// NOTE: the inner `function g(a, b, c)` below shadows this outer `g` inside
// the body.
function g(a) {
// Rotate the 32-bit value `a` left by `b` bits.
function b(a, b) {
return a << b | a >>> 32 - b
}
// Add `a` and `b` as 32-bit values: the low 30 bits are summed directly and
// the top two bits (masks 2147483648 and 1073741824) are patched in with XOR.
function c(a, b) {
var c, d, e, f, g;
return e = 2147483648 & a, f = 2147483648 & b, c = 1073741824 & a, d = 1073741824 & b, g = (1073741823 & a) + (1073741823 & b), c & d ? 2147483648 ^ g ^ e ^ f : c | d ? 1073741824 & g ? 3221225472 ^ g ^ e ^ f : 1073741824 ^ g ^ e ^ f : g ^ e ^ f
}
// Bitwise mixing function used by step helper `h`.
function d(a, b, c) {
return a & b | ~a & c
}
// Bitwise mixing function used by step helper `i`.
function e(a, b, c) {
return a & c | b & ~c
}
// Bitwise mixing function used by step helper `j`.
function f(a, b, c) {
return a ^ b ^ c
}
// Bitwise mixing function used by step helper `k`.
function g(a, b, c) {
return b ^ (a | ~c)
}
// Step helpers h/i/j/k share one shape: add the mixing function's result,
// the message word (5th argument) and the constant (7th argument) to `a`,
// rotate left by the 6th argument, then add the second argument.
function h(a, e, f, g, h, i, j) {
return a = c(a, c(c(d(e, f, g), h), j)), c(b(a, i), e)
}
function i(a, d, f, g, h, i, j) {
return a = c(a, c(c(e(d, f, g), h), j)), c(b(a, i), d)
}
function j(a, d, e, g, h, i, j) {
return a = c(a, c(c(f(d, e, g), h), j)), c(b(a, i), d)
}
function k(a, d, e, f, h, i, j) {
return a = c(a, c(c(g(d, e, f), h), j)), c(b(a, i), d)
}
// Pack the string into an array of 32-bit words (4 character codes per word,
// lowest byte first), append the 128 (0x80) marker after the last character,
// and store the length in bits in the final two words. The array length is a
// multiple of 16 words.
function l(a) {
for (var b, c = a.length, d = c + 8, e = (d - d % 64) / 64, f = 16 * (e + 1), g = new Array(f - 1), h = 0, i = 0; c > i;) b = (i - i % 4) / 4, h = i % 4 * 8, g[b] = g[b] | a.charCodeAt(i) << h, i++;
return b = (i - i % 4) / 4, h = i % 4 * 8, g[b] = g[b] | 128 << h, g[f - 2] = c << 3, g[f - 1] = c >>> 29, g
}
// Format a 32-bit word as 8 hex characters, lowest byte first, each byte
// zero-padded to two digits.
function m(a) {
var b, c, d = "", e = "";
for (c = 0; 3 >= c; c++) b = a >>> 8 * c & 255, e = "0" + b.toString(16), d += e.substr(e.length - 2, 2);
return d
}
// Replace "\r\n" with "\n", then re-encode each character code as one, two
// or three byte-valued characters (UTF-8 style; every code >= 2048 takes the
// three-byte branch).
function n(a) {
a = a.replace(/\r\n/g, "\n");
for (var b = "", c = 0; c < a.length; c++) {
var d = a.charCodeAt(c);
128 > d ? b += String.fromCharCode(d) : d > 127 && 2048 > d ? (b += String.fromCharCode(d >> 6 | 192), b += String.fromCharCode(63 & d | 128)) : (b += String.fromCharCode(d >> 12 | 224), b += String.fromCharCode(d >> 6 & 63 | 128), b += String.fromCharCode(63 & d | 128))
}
return b
}
// y..N are the per-step rotation amounts (four distinct values per round).
var o, p, q, r, s, t, u, v, w, x = [], y = 7, z = 12, A = 17, B = 22, C = 5, D = 9, E = 14, F = 20, G = 4,
H = 11, I = 16, J = 23, K = 6, L = 10, M = 15, N = 21;
// Encode and pack the input, seed the four state words t/u/v/w, then process
// the words 16 at a time: save the state in p/q/r/s, run 16 steps each of
// h, i, j and k, and add the saved state back in at the end of each chunk.
for (a = n(a), x = l(a), t = 1732584193, u = 4023233417, v = 2562383102, w = 271733878, o = 0; o < x.length; o += 16) p = t, q = u, r = v, s = w, t = h(t, u, v, w, x[o + 0], y, 3614090360), w = h(w, t, u, v, x[o + 1], z, 3905402710), v = h(v, w, t, u, x[o + 2], A, 606105819), u = h(u, v, w, t, x[o + 3], B, 3250441966), t = h(t, u, v, w, x[o + 4], y, 4118548399), w = h(w, t, u, v, x[o + 5], z, 1200080426), v = h(v, w, t, u, x[o + 6], A, 2821735955), u = h(u, v, w, t, x[o + 7], B, 4249261313), t = h(t, u, v, w, x[o + 8], y, 1770035416), w = h(w, t, u, v, x[o + 9], z, 2336552879), v = h(v, w, t, u, x[o + 10], A, 4294925233), u = h(u, v, w, t, x[o + 11], B, 2304563134), t = h(t, u, v, w, x[o + 12], y, 1804603682), w = h(w, t, u, v, x[o + 13], z, 4254626195), v = h(v, w, t, u, x[o + 14], A, 2792965006), u = h(u, v, w, t, x[o + 15], B, 1236535329), t = i(t, u, v, w, x[o + 1], C, 4129170786), w = i(w, t, u, v, x[o + 6], D, 3225465664), v = i(v, w, t, u, x[o + 11], E, 643717713), u = i(u, v, w, t, x[o + 0], F, 3921069994), t = i(t, u, v, w, x[o + 5], C, 3593408605), w = i(w, t, u, v, x[o + 10], D, 38016083), v = i(v, w, t, u, x[o + 15], E, 3634488961), u = i(u, v, w, t, x[o + 4], F, 3889429448), t = i(t, u, v, w, x[o + 9], C, 568446438), w = i(w, t, u, v, x[o + 14], D, 3275163606), v = i(v, w, t, u, x[o + 3], E, 4107603335), u = i(u, v, w, t, x[o + 8], F, 1163531501), t = i(t, u, v, w, x[o + 13], C, 2850285829), w = i(w, t, u, v, x[o + 2], D, 4243563512), v = i(v, w, t, u, x[o + 7], E, 1735328473), u = i(u, v, w, t, x[o + 12], F, 2368359562), t = j(t, u, v, w, x[o + 5], G, 4294588738), w = j(w, t, u, v, x[o + 8], H, 2272392833), v = j(v, w, t, u, x[o + 11], I, 1839030562), u = j(u, v, w, t, x[o + 14], J, 4259657740), t = j(t, u, v, w, x[o + 1], G, 2763975236), w = j(w, t, u, v, x[o + 4], H, 1272893353), v = j(v, w, t, u, x[o + 7], I, 4139469664), u = j(u, v, w, t, x[o + 10], J, 3200236656), t = j(t, u, v, w, x[o + 13], G, 681279174), w = j(w, t, u, v, x[o + 0], H, 
3936430074), v = j(v, w, t, u, x[o + 3], I, 3572445317), u = j(u, v, w, t, x[o + 6], J, 76029189), t = j(t, u, v, w, x[o + 9], G, 3654602809), w = j(w, t, u, v, x[o + 12], H, 3873151461), v = j(v, w, t, u, x[o + 15], I, 530742520), u = j(u, v, w, t, x[o + 2], J, 3299628645), t = k(t, u, v, w, x[o + 0], K, 4096336452), w = k(w, t, u, v, x[o + 7], L, 1126891415), v = k(v, w, t, u, x[o + 14], M, 2878612391), u = k(u, v, w, t, x[o + 5], N, 4237533241), t = k(t, u, v, w, x[o + 12], K, 1700485571), w = k(w, t, u, v, x[o + 3], L, 2399980690), v = k(v, w, t, u, x[o + 10], M, 4293915773), u = k(u, v, w, t, x[o + 1], N, 2240044497), t = k(t, u, v, w, x[o + 8], K, 1873313359), w = k(w, t, u, v, x[o + 15], L, 4264355552), v = k(v, w, t, u, x[o + 6], M, 2734768916), u = k(u, v, w, t, x[o + 13], N, 1309151649), t = k(t, u, v, w, x[o + 4], K, 4149444226), w = k(w, t, u, v, x[o + 11], L, 3174756917), v = k(v, w, t, u, x[o + 2], M, 718787259), u = k(u, v, w, t, x[o + 9], N, 3951481745), t = c(t, p), u = c(u, q), v = c(v, r), w = c(w, s);
// Concatenate the hex form of the four state words to form the digest.
var O = m(t) + m(u) + m(v) + m(w);
return O.toLowerCase()
}
当然有python更加方便的方式:
python生成MD5码:
def hex_md5(s):
    """Return the MD5 digest of *s* as a 32-character lowercase hex string.

    The text is encoded as UTF-8 before hashing.
    """
    return hashlib.md5(s.encode('UTF-8')).hexdigest()
关于怎么获取_m_h5_tk和_m_h5_tk_enc:
可以先发起一个不带cookie的请求,该接口会返回一个令牌为空的错误,并在响应中设置如下cookies:
<RequestsCookieJar[<Cookie _m_h5_tk=9dade45ab1b2eb31ee7091acdc91db7b_1560105148770 for .taobao.com/>, <Cookie _m_h5_tk_enc=fb12ea6b463d70c9f907f9cbc962d6f5 for .taobao.com/>, <Cookie t=6cb12234d6e930fb198dbb19e73e3655 for .taobao.com/>]>
代码:
params = {
    'appKey': appKey,
    'data': data,
}
# Send a first request without any cookies: the response carries the
# _m_h5_tk / _m_h5_tk_enc cookies that are read below.
# A timeout keeps the script from hanging forever on a stalled connection.
html = requests.get(url, params=params, timeout=10)
# Named with the leading underscore so the names match the cookie names and
# the `_m_h5_tk.split('_')[0]` expression used to extract the token.
_m_h5_tk = html.cookies['_m_h5_tk']
_m_h5_tk_enc = html.cookies['_m_h5_tk_enc']
其中params是必不可少的,然后可以根据 token = _m_h5_tk.split('_')[0] 得到token;
然后再根据已知的数据构造请求,就可得到数据;
至此,该请求大概都已分析完毕,上代码!!!:
import hashlib
import json
import time
import requests
import pymysql as mdb
def hex_md5(s):
    """Return the MD5 digest of *s* as a 32-character lowercase hex string.

    The text is encoded as UTF-8 before hashing.
    """
    return hashlib.md5(s.encode('UTF-8')).hexdigest()
def _build_feed_data(index, num):
    """Serialize the ``data`` query value for page *index* with *num* items."""
    # Compact separators (no spaces) keep the string identical to the
    # hand-written form; the sign is an MD5 over this exact text.
    compact = (',', ':')
    inner = {
        'nodeId': '',
        'sellerId': '50852803',
        'pagination': {
            'direction': '1',
            'hasMore': 'true',
            'pageNum': str(index),
            'pageSize': str(num),
        },
    }
    outer = {
        # "params" is itself a JSON document embedded as a string, so its
        # quotes end up backslash-escaped in the outer document.
        'params': json.dumps(inner, separators=compact, ensure_ascii=False),
        'cursor': str(index),
        'pageNum': str(index),
        'pageId': 5703,
        'env': '1',
    }
    return json.dumps(outer, separators=compact, ensure_ascii=False)


def get_page(index, num):
    """Fetch one page of the feed, print its items and return them.

    Two requests are made: the first, without cookies, only obtains the
    ``_m_h5_tk`` / ``_m_h5_tk_enc`` cookies; the second sends them back
    together with the timestamp and the computed sign.

    Args:
        index: Page number; also used as the ``cursor`` value.
        num: Page size; the last ``num`` entries of the returned list are
            printed.

    Returns:
        The list of items that were printed.

    Raises:
        KeyError: If an expected cookie or response field is missing.
    """
    url = 'https://acs.m.taobao.com/h5/mtop.taobao.social.feed.aggregate/1.0/'
    appKey = '12574478'
    timeout = 10  # seconds; avoid hanging forever on a stalled connection
    # Current Unix timestamp in milliseconds, as a string
    t = str(int(time.time() * 1000))
    data = _build_feed_data(index, num)

    # First request carries no cookies; it is only used to receive them.
    bootstrap = requests.get(
        url, params={'appKey': appKey, 'data': data}, timeout=timeout)
    _m_h5_tk = bootstrap.cookies['_m_h5_tk']
    _m_h5_tk_enc = bootstrap.cookies['_m_h5_tk_enc']
    # The cookie value is "<token>_<timestamp>"; the token is the first part.
    token = _m_h5_tk.split('_')[0]

    # sign = md5(token & t & appKey & data)
    sign = hex_md5('&'.join((token, t, appKey, data)))
    print('秘钥:' + sign)

    # Second request: send the cookies back along with t and sign.
    headers = {
        'cookie': '_m_h5_tk=' + _m_h5_tk + '; _m_h5_tk_enc=' + _m_h5_tk_enc,
    }
    params = {
        'appKey': appKey,
        't': t,
        'sign': sign,
        'data': data,
    }
    resp = requests.get(url, headers=headers, params=params, timeout=timeout)
    item = json.loads(resp.text)
    # The first page holds 21 entries and the first one is not useful, so
    # only the last `num` entries are kept.
    items = item['data']['list'][-num:]
    for i in items:
        print(i)
    return items
# Fetch page 2 with a page size of 20 and print the items.
get_page(2, 20)
结果:
有问题可联系qq :1124241615