一、观察网页源代码
目标网站:fanyi.baidu.com,输入需要查询的单词,出现如下界面
右键查看网页源代码:
按ctrl+f搜索关键字 '苹果',搜索的结果为0,说明翻译结果并不在网页源代码中,也就不能用requests直接请求页面来获取数据了。数据可能是经过加密的,或者是通过js动态加载的,因此需要对数据加载的逻辑进行分析
二、抓包查看参数
重新点开fanyi.baidu.com进入到翻译界面,右键打开检查,勾选Network-->Fetch/XHR,在左边粘贴所要输入的单词,比如:apple
粘贴输入完成后出现了如下界面,挨个进行查看是否有翻译过后的内容
在sug的请求中发现有一些翻译的内容,需要注意的是,这里的翻译内容并非是我们需要的那个翻译
于是继续往下寻找,在v2transapi开头的请求中发现翻译的结果
查看请求的参数信息
请求方式为post,参数在payload下面查看
三、python发送请求获取数据
于是回到界面中编写代码,发送请求数据
import requests
# Endpoint of the translation request (the "v2transapi" XHR); from/to select the language pair.
url = r'https://fanyi.baidu.com/v2transapi?from=en&to=zh'

# Form payload sent with the POST request.
# 'sign' and 'token' are fixed values here; this 'sign' belongs to the query 'apple'.
data = {
    'from': 'en',
    'to': 'zh',
    'query': 'apple',
    'transtype': 'realtime',
    'simple_means_flag': 3,
    'sign': '704513.926512',
    'token': '57ad27fdfc82112d8227dd87c55f633f',
    'domain': 'common',
}

# Request headers.
# NOTE(review): the Cookie looks like it was captured from one browser session; presumably it
# has to be replaced with a cookie from your own session - confirm before reusing.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Cookie': '__yjs_duid=1_0494dcb7db96dbf2e7eb8e643fb6ec9e1635902286958; REALTIME_TRANS_SWITCH=1; HISTORY_SWITCH=1; FANYI_WORD_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; BDUSS=pleldUVEk1NGlDUmExNDNJSGYwWFFDcX5SaHhmNUg4OGk5b0l1Wm42UDFIckpoRUFBQUFBJCQAAAAAAAAAAAEAAADXXK1EMzE5Nzc2MTAydwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPWRimH1kYphR0; BDUSS_BFESS=pleldUVEk1NGlDUmExNDNJSGYwWFFDcX5SaHhmNUg4OGk5b0l1Wm42UDFIckpoRUFBQUFBJCQAAAAAAAAAAAEAAADXXK1EMzE5Nzc2MTAydwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPWRimH1kYphR0; BAIDUID=CE404BEB4FC3648E0C1F8B039B8EA0AC:FG=1; BIDUPSID=CE404BEB4FC3648E0C1F8B039B8EA0AC; PSTM=1638351819; BAIDUID_BFESS=CE404BEB4FC3648E0C1F8B039B8EA0AC:FG=1; APPGUIDE_10_0_2=1; H_PS_PSSID=35639_34441_35468_35104_31660_35628_35488_34584_35491_35688_35542_35665_35318_26350_35619_35562; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1641048772,1641049491; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1641090825; __yjs_st=2_YjYyODIwODU2MTUyOWZlYTY2MWViYTBhMTZkODczMWQxMzlhYjU1MzlmM2QzM2NiYjE3NGNjOTU0ZDgzNDk3ZGU1M2Y3NTZiNDJjZDViZGJkZWIxMTllYzE1MTBjM2FiOGIzMTI1MDBlZjBmNmY2MGNkNTM1YmEzYTYxZTlmMjBmNDcxMTVkOWNiM2M2ZWViNjRiYzRkYzk4NWE2ZTFiMjI2MTI2MWFjYTJhMTEyMTg5MDk2MDcxZTNhOGRlMDFkYjE3MzRiNmI0OTcxMTAwYmQ4ZGZmY2NjMGMyYTM4NjA0Y2UzNGZhMDRhMDIwYmVkNjljYzhmM2I1NTQ2Nzg0ZV83X2Q1OWE3ZTQ0; ab_sr=1.0.1_ODJhZWI3ZTc5MDRhZjA4OGUzYjM3YjdjMjlhNzM0OWUwYjM5ODBhNGVlNjg0MDY4NjI0NWI0NDZhMGQ4MDRhZDVlYTAzNDZjNzIzNDM4YzY4ZTNmZTRlZmY5MTFmMjU3MzkwYTVlMDQyMjJjODYzZTMzYjVmYjAxMzE2ODk3ZDUyYzI2ZTNlY2QwNTY0M2I3MmI4NTNmMWFkZTVmZTM3ZDk5YWJiNjA0ZDMxNjhlMTczYjkxZDE4MzQzMWZiZGVh',
}

if __name__ == '__main__':
    # The body of the guard must be indented, otherwise Python raises IndentationError.
    # A timeout keeps the script from blocking forever when the server does not answer.
    resp = requests.post(url, headers=headers, data=data, timeout=10)
    print(resp.json())
此处运行代码可以获取到返回的json数据,我们需要的结果也在json里边,接下来自行取结果就可以了
四、逆向,参数寻找
上边是针对apple这个单词的翻译,那么假如我们需要查询pig这个单词呢?每次手动修改参数再去请求太麻烦了,继续分析:
可以发现当切换不同的单词的时候,请求的参数在跟着改变,将请求的data进行改变,重新访问,访问成功
经过不断变换参数可以发现影响结果的有两个参数,query和sign,其中query是我们输入的单词,sign是一串我们看不懂的数字,因此只要知道了sign这个参数,那么问题就解决了
重新回到浏览器,从上往下寻找参数
在第三个文件中发现目标
设置断点进行debug
鼠标点击1或者2进入到该函数中
五、编写js,本地调用
点击之后进入到如下界面,观察一下这个函数的上下可以发现它是一个闭包函数,可以通过将其改写成python代码实现,也可以将其作为js直接运行,此处使用第二种方式:
将整个函数内部copy出来新建一个js文件baidu.js
/**
 * Returns a new array holding the elements of `r`.
 * Real arrays are copied index by index; any other value is handed to Array.from.
 */
function a(r) {
    if (!Array.isArray(r)) {
        return Array.from(r);
    }
    var copy = Array(r.length);
    for (var idx = 0; idx < r.length; idx++) {
        copy[idx] = r[idx];
    }
    return copy;
}
/**
 * Runs the mixing steps described by `o` over the number `r` and returns the result.
 *
 * `o` is consumed three characters at a time:
 *   [0] "+" -> add the shifted value (result wrapped to a 32-bit integer), otherwise XOR it
 *   [1] "+" -> unsigned right shift, otherwise left shift
 *   [2] shift amount: a character >= "a" maps to char code - 87 ("a" -> 10), otherwise the digit itself
 */
function n(r, o) {
    var acc = r;
    var limit = o.length - 2;
    for (var pos = 0; pos < limit; pos += 3) {
        var combine = o.charAt(pos);
        var direction = o.charAt(pos + 1);
        var amountChar = o.charAt(pos + 2);

        var amount = amountChar >= "a" ? amountChar.charCodeAt(0) - 87 : Number(amountChar);
        var shifted = direction === "+" ? acc >>> amount : acc << amount;

        if (combine === "+") {
            acc = (acc + shifted) & 4294967295;
        } else {
            acc = acc ^ shifted;
        }
    }
    return acc;
}
// Builds the "sign" string for the query text `r` and returns it as "<p>.<p ^ m>".
// Besides the helpers a() and n(), the body reads two names it does not define
// itself: the variable `i` and the global `window`.
function e(r) {
// All UTF-16 surrogate pairs found in the input, or null when there are none.
var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
if (null === o) {
var t = r.length;
// Longer than 30 code units: keep the first 10, 10 starting at floor(t / 2) - 5, and the last 10.
t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr(-10, 10))
} else {
// Rebuild the text as a list `f` whose elements are single code units or whole surrogate pairs.
for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++)
"" !== e[C] && f.push.apply(f, a(e[C].split(""))),
C !== h - 1 && f.push(o[C]);
var g = f.length;
// Same 10 + 10 + 10 shortening as above, counted in elements of `f`.
g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice(-10).join(""))
}
// `l` evaluates to the string "gtk" (char codes 103, 116, 107).
var u = void 0
, l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
// Use `i` when it is not null; otherwise store window[l] (or "") in `i` and use that value.
u = null !== i ? i : (i = window[l] || "") || "";
// m and s are the two dot-separated numeric parts of `u` (0 when missing or not numeric).
// S receives the UTF-8 bytes of `r`; c is the next write index into S.
for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) {
var A = r.charCodeAt(v);
// 1 byte below 128, 2 bytes below 2048, 4 bytes for a high surrogate followed by a low surrogate, 3 bytes otherwise.
128 > A ? S[c++] = A : (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)),
S[c++] = A >> 18 | 240,
S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224,
S[c++] = A >> 6 & 63 | 128),
S[c++] = 63 & A | 128)
}
// F evaluates to "+-a^+6" and D to "+-3^+b+-f". Starting from m, every byte is added to p and mixed with n(p, F).
for (var p = m, F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++)
p += S[b],
p = n(p, F);
// Final mix with D, XOR with s, turn a negative value into its unsigned 32-bit equivalent,
// reduce modulo 1e6, then return "<p>.<p ^ m>".
return p = n(p, D),
p ^= s,
0 > p && (p = (2147483647 & p) + 2147483648),
p %= 1e6,
p.toString() + "." + (p ^ m)
}
将刚刚的代码进行调整,加载这个js文件
import requests
import execjs
# Endpoint of the translation request (the "v2transapi" XHR); from/to select the language pair.
url = r'https://fanyi.baidu.com/v2transapi?from=en&to=zh'

# Compile the extracted signing code. The context manager closes baidu.js as soon as it
# has been read instead of leaving the file handle open for the rest of the run.
with open("baidu.js", mode="r", encoding="utf-8") as f:
    obj = execjs.compile(f.read())

word = input('word:')

# Form payload sent with the POST request; 'sign' is computed from the word by the JS function e().
data = {
    'from': 'en',
    'to': 'zh',
    'query': word,
    'transtype': 'realtime',
    'simple_means_flag': 3,
    'sign': obj.call("e", word),
    'token': '57ad27fdfc82112d8227dd87c55f633f',
    'domain': 'common',
}

# Request headers.
# NOTE(review): the Cookie looks like it was captured from one browser session; presumably it
# has to be replaced with a cookie from your own session - confirm before reusing.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Cookie': '__yjs_duid=1_0494dcb7db96dbf2e7eb8e643fb6ec9e1635902286958; REALTIME_TRANS_SWITCH=1; HISTORY_SWITCH=1; FANYI_WORD_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; BDUSS=pleldUVEk1NGlDUmExNDNJSGYwWFFDcX5SaHhmNUg4OGk5b0l1Wm42UDFIckpoRUFBQUFBJCQAAAAAAAAAAAEAAADXXK1EMzE5Nzc2MTAydwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPWRimH1kYphR0; BDUSS_BFESS=pleldUVEk1NGlDUmExNDNJSGYwWFFDcX5SaHhmNUg4OGk5b0l1Wm42UDFIckpoRUFBQUFBJCQAAAAAAAAAAAEAAADXXK1EMzE5Nzc2MTAydwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPWRimH1kYphR0; BAIDUID=CE404BEB4FC3648E0C1F8B039B8EA0AC:FG=1; BIDUPSID=CE404BEB4FC3648E0C1F8B039B8EA0AC; PSTM=1638351819; BAIDUID_BFESS=CE404BEB4FC3648E0C1F8B039B8EA0AC:FG=1; APPGUIDE_10_0_2=1; H_PS_PSSID=35639_34441_35468_35104_31660_35628_35488_34584_35491_35688_35542_35665_35318_26350_35619_35562; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1641048772,1641049491; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1641090825; __yjs_st=2_YjYyODIwODU2MTUyOWZlYTY2MWViYTBhMTZkODczMWQxMzlhYjU1MzlmM2QzM2NiYjE3NGNjOTU0ZDgzNDk3ZGU1M2Y3NTZiNDJjZDViZGJkZWIxMTllYzE1MTBjM2FiOGIzMTI1MDBlZjBmNmY2MGNkNTM1YmEzYTYxZTlmMjBmNDcxMTVkOWNiM2M2ZWViNjRiYzRkYzk4NWE2ZTFiMjI2MTI2MWFjYTJhMTEyMTg5MDk2MDcxZTNhOGRlMDFkYjE3MzRiNmI0OTcxMTAwYmQ4ZGZmY2NjMGMyYTM4NjA0Y2UzNGZhMDRhMDIwYmVkNjljYzhmM2I1NTQ2Nzg0ZV83X2Q1OWE3ZTQ0; ab_sr=1.0.1_ODJhZWI3ZTc5MDRhZjA4OGUzYjM3YjdjMjlhNzM0OWUwYjM5ODBhNGVlNjg0MDY4NjI0NWI0NDZhMGQ4MDRhZDVlYTAzNDZjNzIzNDM4YzY4ZTNmZTRlZmY5MTFmMjU3MzkwYTVlMDQyMjJjODYzZTMzYjVmYjAxMzE2ODk3ZDUyYzI2ZTNlY2QwNTY0M2I3MmI4NTNmMWFkZTVmZTM3ZDk5YWJiNjA0ZDMxNjhlMTczYjkxZDE4MzQzMWZiZGVh',
}

if __name__ == '__main__':
    # The body of the guard must be indented, otherwise Python raises IndentationError.
    # A timeout keeps the script from blocking forever when the server does not answer.
    resp = requests.post(url, headers=headers, data=data, timeout=10)
    print(resp.json())
运行之后会报如下错误:
六、调整js文件
返回刚刚编写的js文件:
发现代码中用到了window,而nodejs环境中没有浏览器的window对象,因此需要手工添加一个window。
将上一行中变量l的表达式copy出来放到浏览器控制台中去运行,得到的结果是字符串'gtk',也就是说代码读取的是window['gtk']
这个gtk的值在浏览器中搜索关键字获取
gtk对应的值为'320305.131321201'
在baidu.js文件开头添加:
并修改js代码:把window[l]直接写成window['gtk'](变量l不再需要),同时补上var i = null;的声明,因为函数e中用到的变量i在拷贝出来的代码里没有定义
修改后的js完整代码:
// Stand-in for the browser global `window`: e() reads window['gtk'] from it.
var window = { gtk: '320305.131321201' };
/**
 * Returns a new array holding the elements of `r`.
 * Real arrays are copied index by index; any other value is handed to Array.from.
 */
function a(r) {
    if (!Array.isArray(r)) {
        return Array.from(r);
    }
    var copy = Array(r.length);
    for (var idx = 0; idx < r.length; idx++) {
        copy[idx] = r[idx];
    }
    return copy;
}
/**
 * Runs the mixing steps described by `o` over the number `r` and returns the result.
 *
 * `o` is consumed three characters at a time:
 *   [0] "+" -> add the shifted value (result wrapped to a 32-bit integer), otherwise XOR it
 *   [1] "+" -> unsigned right shift, otherwise left shift
 *   [2] shift amount: a character >= "a" maps to char code - 87 ("a" -> 10), otherwise the digit itself
 */
function n(r, o) {
    var acc = r;
    var limit = o.length - 2;
    for (var pos = 0; pos < limit; pos += 3) {
        var combine = o.charAt(pos);
        var direction = o.charAt(pos + 1);
        var amountChar = o.charAt(pos + 2);

        var amount = amountChar >= "a" ? amountChar.charCodeAt(0) - 87 : Number(amountChar);
        var shifted = direction === "+" ? acc >>> amount : acc << amount;

        if (combine === "+") {
            acc = (acc + shifted) & 4294967295;
        } else {
            acc = acc ^ shifted;
        }
    }
    return acc;
}
// Cache for the gtk seed string; e() fills it from window['gtk'] while it is still null.
var i = null;

/**
 * Builds the "sign" string for the query text `r`.
 *
 * Steps:
 *   1. Text longer than 30 symbols is shortened to its first 10, middle 10 and last 10
 *      symbols (a surrogate pair counts as one symbol when the text contains any).
 *   2. The seed "<m>.<s>" is taken from `i`, or from window['gtk'] when `i` is null.
 *   3. The text is UTF-8 encoded; starting from m, every byte is added and mixed with n().
 *   4. After a final mix the value is XORed with s, made non-negative and reduced modulo 1e6.
 *
 * Returns "<p>.<p ^ m>".
 */
function e(r) {
    var pairs = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);

    // Step 1: shorten long input.
    if (pairs === null) {
        var unitCount = r.length;
        if (unitCount > 30) {
            var midStart = Math.floor(unitCount / 2) - 5;
            r = "" + r.slice(0, 10) + r.slice(midStart, midStart + 10) + r.slice(-10);
        }
    } else {
        // Interleave the plain-text pieces with the surrogate pairs that separated them.
        var pieces = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/);
        var pieceCount = pieces.length;
        var symbols = [];
        for (var k = 0; k < pieceCount; k++) {
            if (pieces[k] !== "") {
                symbols.push.apply(symbols, a(pieces[k].split("")));
            }
            if (k !== pieceCount - 1) {
                symbols.push(pairs[k]);
            }
        }
        var symbolCount = symbols.length;
        if (symbolCount > 30) {
            var half = Math.floor(symbolCount / 2);
            r = symbols.slice(0, 10).join("") + symbols.slice(half - 5, half + 5).join("") + symbols.slice(-10).join("");
        }
    }

    // Step 2: resolve the seed, caching it in `i` on first use.
    var seed;
    if (i !== null) {
        seed = i;
    } else {
        i = window['gtk'] || "";
        seed = i || "";
    }
    var seedParts = seed.split(".");
    var m = Number(seedParts[0]) || 0;
    var s = Number(seedParts[1]) || 0;

    // Step 3a: UTF-8 encode the (possibly shortened) text.
    var bytes = [];
    for (var pos = 0; pos < r.length; pos++) {
        var code = r.charCodeAt(pos);
        if (code < 128) {
            bytes.push(code);
            continue;
        }
        if (code < 2048) {
            bytes.push((code >> 6) | 192);
        } else {
            var startsPair = 55296 === (64512 & code) &&
                pos + 1 < r.length &&
                56320 === (64512 & r.charCodeAt(pos + 1));
            if (startsPair) {
                // Combine both surrogates into one code point and consume the second unit.
                code = 65536 + ((1023 & code) << 10) + (1023 & r.charCodeAt(++pos));
                bytes.push((code >> 18) | 240);
                bytes.push(((code >> 12) & 63) | 128);
            } else {
                bytes.push((code >> 12) | 224);
            }
            bytes.push(((code >> 6) & 63) | 128);
        }
        bytes.push((63 & code) | 128);
    }

    // Step 3b: fold every byte into the running value.
    var perByteSteps = "+-a^+6";
    var finalSteps = "+-3^+b+-f";
    var p = m;
    for (var b = 0; b < bytes.length; b++) {
        p = n(p + bytes[b], perByteSteps);
    }

    // Step 4: final mix and formatting.
    p = n(p, finalSteps);
    p ^= s;
    if (p < 0) {
        // Reinterpret the negative 32-bit value as unsigned.
        p = (2147483647 & p) + 2147483648;
    }
    p %= 1e6;
    return p.toString() + "." + (p ^ m);
}
七、执行py文件获取结果
重新运行py文件,成功获取结果(后续json处理此处省略)