我们在爬虫时经常会遇到一些奇怪的参数,比如百度翻译的sign,网易云音乐的params等,这个时候就要用js逆向的技术来获取参数的构造方法
前置准备
Chrome浏览器,Sublime Text编辑器,Python
爬取链接:https://fanyi.baidu.com/
抓包
按下F12打开Chrome的开发者工具,选中Network——XHR,然后在翻译框中随便输入一个单词,看弹出的请求
查看一下请求,发现在链接为https://fanyi.baidu.com/v2transapi?from=en&to=zh的请求中找到了翻译结果
简单看一下请求,这是一个post请求,传递了
- from: en
- to: zh
- query: python
- transtype: translang
- simple_means_flag: 3
- sign: 477811.239938
- token: aa76a1ea4d6f12324eb78d55aacce9fc
- domain: common
这几个参数,其中from是输入的语言简称,to是翻译语言的简称,query是所要翻译的单词,transtype、simple_means_flag、token、domain都是固定值,那就是这个sign比较蹊跷了,又比较了几个单词,发现不同的内容有不同的sign,那么sign是怎么得到的?
JS逆向
打开Source界面,按下Ctrl+Shift+F全局搜索sign,搜索结果如下
盲猜在链接为https://fanyi-cdn.cdn.bcebos.com/static/translation/pkg/index_88a11cb.js的js文件里,打开文件,但这个代码。。。明显不是人能看的,按下左下角的**{}**将代码格式化
然后按下Ctrl+F输入sign进行搜索,发现有11个结果
我们一个个来看
发现第8个结果好像是所需的代码
这里说sign是由 y 函数生成的,我们在这一行打个断点,刷新一下界面,重新输入单词,开始debug
把光标悬停在参数n上,看见参数n是我们输入的内容
把光标悬停在y上,自动弹出了真实的函数
点一下蓝色字部分,就跳转到了e函数,这就是生成sign的函数了
在sublime中新建一个js文件,把e函数的内容Ctrl+C,Ctrl+V粘贴到js文件里,再做一些加工,代码如下
/**
 * Builds the `sign` string for the text `r`.
 *
 * The function relies on two names that this snippet does not define:
 *   - `i`: a "<number>.<number>" seed string (falls back to window.gtk when null)
 *   - `n`: the mixing function applied to the running hash
 * Run as-is, it therefore throws a ReferenceError at the first use of `i`.
 *
 * @param {string} r - text to sign
 * @returns {string} "<p>.<p ^ m>", where p is the hash reduced modulo 1e6
 */
function e(r) {
// Collect every UTF-16 surrogate pair (characters outside the BMP) in the input.
var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
if (null === o) {
// No surrogate pairs: inputs longer than 30 chars are shortened to the
// first 10 + the 10 around the middle + the last 10 characters.
var t = r.length;
t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr(-10, 10))
} else {
// Surrogate pairs present: build `f`, a list in which each pair stays one
// element, then apply the same 10/10/10 shortening on that list.
// NOTE: `a` is not defined in this snippet, so this branch throws if reached.
for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++)
"" !== e[C] && f.push.apply(f, a(e[C].split(""))),
C !== h - 1 && f.push(o[C]);
var g = f.length;
g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice(-10).join(""))
}
// `l` spells "gtk" (char codes 103, 116, 107); the seed `u` is `i` when it is
// non-null, otherwise window.gtk.
var u = void 0
, l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
u = null !== i ? i : (i = window[l] || "") || "";
// Split the seed on "." into the two numbers m and s, then encode `r` as
// UTF-8 bytes into S (1-, 2-, 3- and 4-byte forms).
for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) {
var A = r.charCodeAt(v);
128 > A ? S[c++] = A : (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)),
S[c++] = A >> 18 | 240,
S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224,
S[c++] = A >> 6 & 63 | 128),
S[c++] = 63 & A | 128)
}
// F spells "+-a^+6" and D spells "+-3^+b+-f"; both are operation strings
// passed to `n`. Starting from m, add each byte and mix with F.
for (var p = m, F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++)
p += S[b],
p = n(p, F);
// Final mix with D, XOR with s, convert a negative 32-bit value to its
// unsigned equivalent, keep the last six decimal digits, and format.
return p = n(p, D),
p ^= s,
0 > p && (p = (2147483647 & p) + 2147483648),
p %= 1e6,
p.toString() + "." + (p ^ m)
}
// Quick check: compute and print the sign for the word "python".
var result=e("python");
console.log(result);
运行。报错了
提示i未定义(i is not defined),再回浏览器中调试查看
发现i是等于"320305.131321201"的,把它复制到我们的js文件中,简单修改一下
// Seed string in "<number>.<number>" form, hard-coded here so that `e` no
// longer needs the browser's `window` object.
var i="320305.131321201";
/**
 * Builds the `sign` string for the text `r`, using the seed `i` above.
 *
 * The mixing function `n` is still not defined in this snippet, so running
 * it throws a ReferenceError at the first call to `n`.
 *
 * @param {string} r - text to sign
 * @returns {string} "<p>.<p ^ m>", where p is the hash reduced modulo 1e6
 */
function e(r) {
// Collect every UTF-16 surrogate pair (characters outside the BMP) in the input.
var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
if (null === o) {
// No surrogate pairs: inputs longer than 30 chars are shortened to the
// first 10 + the 10 around the middle + the last 10 characters.
var t = r.length;
t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr(-10, 10))
} else {
// Surrogate pairs present: build `f`, a list in which each pair stays one
// element, then apply the same 10/10/10 shortening on that list.
// NOTE: `a` is not defined in this snippet, so this branch throws if reached.
for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++)
"" !== e[C] && f.push.apply(f, a(e[C].split(""))),
C !== h - 1 && f.push(o[C]);
var g = f.length;
g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice(-10).join(""))
}
// `l` spells "gtk" (char codes 103, 116, 107). Because `i` is non-null the
// window lookup on the right-hand side is never evaluated.
var u = void 0
, l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
u = null !== i ? i : (i = window[l] || "") || "";
// Split the seed on "." into the two numbers m and s, then encode `r` as
// UTF-8 bytes into S (1-, 2-, 3- and 4-byte forms).
for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) {
var A = r.charCodeAt(v);
128 > A ? S[c++] = A : (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)),
S[c++] = A >> 18 | 240,
S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224,
S[c++] = A >> 6 & 63 | 128),
S[c++] = 63 & A | 128)
}
// F spells "+-a^+6" and D spells "+-3^+b+-f"; both are operation strings
// passed to `n`. Starting from m, add each byte and mix with F.
for (var p = m, F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++)
p += S[b],
p = n(p, F);
// Final mix with D, XOR with s, convert a negative 32-bit value to its
// unsigned equivalent, keep the last six decimal digits, and format.
return p = n(p, D),
p ^= s,
0 > p && (p = (2147483647 & p) + 2147483648),
p %= 1e6,
p.toString() + "." + (p ^ m)
}
// Quick check: compute and print the sign for the word "python".
var result=e("python");
console.log(result);
运行。又报错。。。
提示n未定义(n is not defined),回到浏览器继续调试
点击蓝色字体传送到函数n,如下
Ctrl+C,Ctrl+V复制到sublime里,再运行终于是成功了
Python代码实现
完整代码如下
import requests
import execjs
import re
class Baidu:
    """Client for the Baidu Translate web endpoint ``/v2transapi``.

    The direction is chosen from the input: text containing a CJK ideograph
    in the range U+4E00-U+9FA5 is translated zh -> en, anything else
    en -> zh.  The ``sign`` form field is computed by running the site's
    own JavaScript through ``execjs``.

    Args:
        word: The text to translate.
        timeout: Seconds to wait for each HTTP request (default 10).
    """

    # JavaScript copied from the site's bundle.  ``e(text)`` returns the sign.
    # ``a`` (array copy) is required by the surrogate-pair branch of ``e``;
    # without it any input containing an emoji raised a ReferenceError.
    # ``i`` is the seed string; because it is non-null, ``window`` is never
    # touched, so the script also runs outside a browser.
    _SIGN_JS = r'''
    function a(r) {
        for (var o = 0, t = []; o < r.length; o++)
            t[o] = r[o];
        return t
    }
    function n(r, o) {
        for (var t = 0; t < o.length - 2; t += 3) {
            var a = o.charAt(t + 2);
            a = a >= "a" ? a.charCodeAt(0) - 87 : Number(a),
            a = "+" === o.charAt(t + 1) ? r >>> a : r << a,
            r = "+" === o.charAt(t) ? r + a & 4294967295 : r ^ a
        }
        return r
    }
    var i="320305.131321201";
    function e(r) {
        var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
        if (null === o) {
            var t = r.length;
            t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr(-10, 10))
        } else {
            for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++)
                "" !== e[C] && f.push.apply(f, a(e[C].split(""))),
                C !== h - 1 && f.push(o[C]);
            var g = f.length;
            g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice(-10).join(""))
        }
        var u = void 0
          , l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
        u = null !== i ? i : (i = window[l] || "") || "";
        for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) {
            var A = r.charCodeAt(v);
            128 > A ? S[c++] = A : (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)),
            S[c++] = A >> 18 | 240,
            S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224,
            S[c++] = A >> 6 & 63 | 128),
            S[c++] = 63 & A | 128)
        }
        for (var p = m, F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++)
            p += S[b],
            p = n(p, F);
        return p = n(p, D),
        p ^= s,
        0 > p && (p = (2147483647 & p) + 2147483648),
        p %= 1e6,
        p.toString() + "." + (p ^ m)
    }
    '''

    def __init__(self, word, timeout=10):
        self.word = word
        self.timeout = timeout
        # Despite the name, this holds the {'from': ..., 'to': ...} pair.
        self.have_chinese = self.__judge(word)
        self.url = f"https://fanyi.baidu.com/v2transapi?from={self.have_chinese['from']}&to={self.have_chinese['to']}"
        # NOTE(review): the cookie below embeds a logged-in session (BDUSS).
        # It should be supplied from configuration rather than committed;
        # it is kept verbatim here so existing behaviour does not change.
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
            'referer': 'https://fanyi.baidu.com/',
            'cookie': 'BIDUPSID=197A77026320029B98FCDD44053FA925; PSTM=1595830559; BAIDUID=197A77026320029B89D159CA4047B0CA:FG=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; SOUND_SPD_SWITCH=1; HISTORY_SWITCH=1; SOUND_PREFER_SWITCH=1; H_WISE_SIDS=148078_145996_150839_150866_150967_149355_150075_147088_150083_151595_148867_148714_150746_147279_150036_150025_151015_146573_148524_151033_127969_146549_152505_149718_146653_151319_151953_146732_145788_152740_149213_131423_152015_146500_144659_132547_147588_152582_107318_151584_149253_152380_140368_152152_144966_152513_146786_152457_150341_151542_152249_147546_148868_151703_110085; BDUSS=JxcDFlUXpSUU9DWERMMnpOWmk0UTBFbVFNdjdQdjhYRDc4anA1RldYZTVMRTVmRVFBQUFBJCQAAAAAAAAAAAEAAAA7Kwn4aGhmaGZmbHkAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALmfJl-5nyZfMl; BDUSS_BFESS=JxcDFlUXpSUU9DWERMMnpOWmk0UTBFbVFNdjdQdjhYRDc4anA1RldYZTVMRTVmRVFBQUFBJCQAAAAAAAAAAAEAAAA7Kwn4aGhmaGZmbHkAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALmfJl-5nyZfMl; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1596547981,1596642955,1596643249,1596676613; H_PS_PSSID=32292_1446_31672_32357_31660_32045_32393_32429_32117_32481_22160; delPer=0; PSINO=6; __yjsv5_shitong=1.0_7_fab021846d809f84d7f379a4a05d4bb7647a_300_1596716294909_175.4.247.37_fbd31867; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1596716295; yjs_js_security_passport=2899009a122b9c61dea15b2d13eef2d122631d98_1596716295_js',
        }

    def __judge(self, word):
        """Return the ``from``/``to`` language pair for ``word``.

        Returns ``{'from': 'zh', 'to': 'en'}`` when ``word`` contains at
        least one character in U+4E00-U+9FA5, else the reverse pair.
        """
        reg = re.compile('[\u4e00-\u9fa5]')
        if re.search(reg, word) is not None:
            return {'from': 'zh', 'to': 'en'}
        return {'from': 'en', 'to': 'zh'}

    def __get_sign(self, word):
        """Return the ``sign`` string for ``word`` by calling JS ``e(word)``."""
        return execjs.compile(self._SIGN_JS).call('e', word)

    def __get_token(self):
        """Fetch the home page and return the ``token`` value embedded in it.

        Raises:
            IndexError: If no ``token: '...'`` fragment is present in the
                page (same exception type as before, now with a message).
        """
        page = requests.get(
            "https://fanyi.baidu.com/",
            headers=self.headers,
            timeout=self.timeout,
        ).text
        match = re.search("token: '(.*?)',", page)
        if match is None:
            raise IndexError("token not found in the Baidu Translate home page")
        return match.group(1)

    def __get_data(self):
        """Assemble the POST form, including the sign, token and language pair."""
        sign = self.__get_sign(self.word)
        token = self.__get_token()
        data = {
            'query': self.word,
            'transtype': 'realtime',
            'simple_means_flag': '3',
            'sign': sign,
            'token': token,
            'domain': 'common',
        }
        data.update(self.have_chinese)
        return data

    @staticmethod
    def _extract_translation(payload):
        """Pull the translated text out of a decoded JSON response.

        The Collins dictionary entry is preferred, as before.  When it is
        absent the plain translation is used instead.

        Raises:
            KeyError: If neither location holds a translation.
        """
        try:
            return payload['dict_result']['collins']['entry'][0]['value'][0]['tran']
        except (KeyError, IndexError, TypeError):
            # No Collins entry for this query; fall through to the plain
            # translation below.
            pass
        try:
            # NOTE(review): 'trans_result' -> 'data'[0] -> 'dst' is the layout
            # assumed for the plain translation; confirm against a live response.
            return payload['trans_result']['data'][0]['dst']
        except (KeyError, IndexError, TypeError):
            keys = sorted(payload) if isinstance(payload, dict) else type(payload).__name__
            raise KeyError(f"no translation found in response: {keys!r}") from None

    def run(self):
        """POST the translation request and return the translated text.

        Raises:
            KeyError: If the response contains no translation.
        """
        data = self.__get_data()
        response = requests.post(
            self.url,
            headers=self.headers,
            data=data,
            timeout=self.timeout,
        )
        return self._extract_translation(response.json())
if __name__ == '__main__':
    # Prompt for the text, translate it, and print the result.
    print(Baidu(input(">>>")).run())