爬取谷歌翻译时,tk参数js破解,用python复现
在爬取谷歌翻译时,请求中需要携带一些字段,其中tk字段相比其他字段较难破解。利用一些js逆向的知识,找到了生成tk字段的js加密代码,如下:
/**
 * Compute the `tk` token string for the text `a`.
 *
 * The text is first converted to its UTF-8 byte sequence (UTF-16 surrogate
 * pairs are merged into a single code point before encoding). The bytes are
 * then folded into a 32-bit hash with RL(), starting from a fixed seed, and
 * the result is formatted as "<hash % 1e6>.<(hash % 1e6) ^ seed>".
 *
 * @param {string} a - Text to sign.
 * @returns {string} The token, two decimal numbers joined by ".".
 */
function tk(a) {
    var seed = 406644;
    var xorKey = 3293161072;
    var separator = ".";
    var perByteOps = "+-a^+6";
    var finalOps = "+-3^+b+-f";

    // Step 1: UTF-8 encode the UTF-16 code units of the input.
    var bytes = [];
    for (var i = 0; i < a.length; i++) {
        var code = a.charCodeAt(i);
        if (code < 128) {
            // Single-byte (ASCII) character.
            bytes.push(code);
            continue;
        }
        if (code < 2048) {
            // Lead byte of a two-byte sequence.
            bytes.push(code >> 6 | 192);
        } else {
            var startsPair = (code & 64512) == 55296 &&
                i + 1 < a.length &&
                (a.charCodeAt(i + 1) & 64512) == 56320;
            if (startsPair) {
                // Merge high + low surrogate into one code point and emit
                // the first two bytes of a four-byte sequence.
                code = 65536 + ((code & 1023) << 10) + (a.charCodeAt(++i) & 1023);
                bytes.push(code >> 18 | 240);
                bytes.push(code >> 12 & 63 | 128);
            } else {
                // Lead byte of a three-byte sequence.
                bytes.push(code >> 12 | 224);
            }
            bytes.push(code >> 6 & 63 | 128);
        }
        // Final continuation byte shared by every multi-byte form.
        bytes.push(code & 63 | 128);
    }

    // Step 2: fold the bytes into the hash.
    var hash = seed;
    for (var j = 0; j < bytes.length; j++) {
        hash = RL(hash + bytes[j], perByteOps);
    }
    hash = RL(hash, finalOps);
    hash ^= xorKey || 0;

    // Step 3: reinterpret a negative 32-bit value as unsigned, then reduce.
    if (hash < 0) {
        hash = (hash & 2147483647) + 2147483648;
    }
    hash %= 1E6;
    return hash.toString() + separator + (hash ^ seed);
}
/**
 * Apply a sequence of shift-and-combine steps to the 32-bit value `a`.
 *
 * `b` is read three characters at a time:
 *   - char 0: "+" adds the shifted value to `a` (result truncated to 32
 *     bits by the bitwise AND); anything else XORs it into `a`.
 *   - char 1: "+" selects an unsigned right shift (>>>); anything else
 *     selects a left shift (<<).
 *   - char 2: the shift amount; characters >= "a" map to charCode - 87
 *     (so "a" is 10), everything else goes through Number().
 *
 * @param {number} a - Starting value.
 * @param {string} b - Operation string, e.g. "+-a^+6".
 * @returns {number} The transformed value.
 */
function RL(a, b) {
    var LETTER_START = "a";
    var PLUS = "+";
    for (var pos = 0; pos < b.length - 2; pos += 3) {
        var combineChar = b.charAt(pos);
        var directionChar = b.charAt(pos + 1);
        var amountChar = b.charAt(pos + 2);

        var amount = amountChar >= LETTER_START
            ? amountChar.charCodeAt(0) - 87
            : Number(amountChar);
        var shifted = directionChar == PLUS ? a >>> amount : a << amount;

        if (combineChar == PLUS) {
            a = a + shifted & 4294967295;
        } else {
            a = a ^ shifted;
        }
    }
    return a;
}
可以通过调用js来运行上述代码, 假设上面的代码保存在sign.js文件中:
import execjs
def get_sign(sentence):
    """Return the ``tk`` token for *sentence* by running ``sign.js``.

    Reads ``sign.js`` from the current working directory, compiles it with
    ``execjs`` and calls its ``tk`` function with *sentence*.

    Args:
        sentence: The text to sign.

    Returns:
        Whatever the JavaScript ``tk`` function returns for *sentence*.
    """
    # ``with`` guarantees the file is closed even if ``read()`` raises.
    with open('sign.js', 'r') as f:
        js_data = f.read()
    return execjs.compile(js_data).call("tk", sentence)
上述代码可以实现调用,但是也有些小问题:在Linux上,调用运行js时,会调用系统的node环境来运行js文件,这样一来系统的cpu和内存占用会很多;并且如果用多进程或多线程来做这件事,会加载许多个node进程,特别占用资源。所以参考js的代码,本人用python复现了加密方式,避免了加载node环境运行js。
加密参数中有些坑,比如js中靠32位整数运算溢出得出的结果,在python中同样的运算是不会溢出的(python的整数是任意精度的),这里参考了一些方法来实现;再如js中的charCodeAt和codePointAt,与python中的ord功能虽有相似但不完全一样(charCodeAt返回的是UTF-16码元,而ord返回的是完整的Unicode码点),这也是个坑,附python版代码:
import ctypes
def int_overflow(val):
    """Wrap *val* into the signed 32-bit integer range.

    A value already inside ``[-2147483648, 2147483647]`` is returned as is.
    Anything else is reduced modulo 2**32 and re-centred into that range,
    mimicking the wrap-around of 32-bit integer arithmetic.
    """
    int32_max = 2147483647
    half_span = int32_max + 1
    if -half_span <= val <= int32_max:
        return val
    return (val + half_span) % (2 * half_span) - int32_max - 1
def unsigned_right_shitf(n, i):
    """Shift *n* right by *i* bits, treating a negative *n* as unsigned.

    A negative *n* is first reinterpreted as its unsigned 32-bit bit
    pattern. The shifted value is passed through ``int_overflow`` before
    being returned. For a negative *i* the value is shifted left by
    ``abs(i)`` instead and the wrapped result is negated.
    """
    # Reinterpret a negative operand as an unsigned 32-bit quantity.
    bits = n & 0xFFFFFFFF if n < 0 else n
    if i < 0:
        return -int_overflow(bits << abs(i))
    return int_overflow(bits >> i)
def left(n, i):
    """Shift *n* left by *i* bits and wrap the result to signed 32 bits.

    A negative *n* is first reinterpreted as its unsigned 32-bit bit
    pattern; the shifted value is then wrapped by ``int_overflow``.
    """
    bits = n & 0xFFFFFFFF if n < 0 else n
    return int_overflow(bits << i)
def RL(a, b):
    """Apply a sequence of shift-and-combine steps to the 32-bit value *a*.

    *b* is consumed three characters at a time:

    * char 0 -- ``"+"`` adds the shifted value to *a*; anything else XORs it.
    * char 1 -- ``"+"`` selects an unsigned right shift; anything else a
      left shift.
    * char 2 -- the shift amount: characters ``>= "a"`` map to
      ``ord(ch) - 87`` (so ``"a"`` is 10), other characters are parsed as a
      decimal digit.

    Every intermediate result is wrapped to a signed 32-bit integer.

    Args:
        a: Starting integer value.
        b: Operation string, e.g. ``"+-a^+6"``.

    Returns:
        The transformed value as a signed 32-bit integer.
    """
    t = "a"
    Yb = "+"
    for c in range(0, len(b) - 2, 3):
        d = b[c + 2]
        if d >= t:
            d = ord(d) - 87
        else:
            try:
                d = int(d)
            except ValueError:
                # JavaScript's Number() yields NaN here, and a NaN shift
                # count behaves as a shift by 0; a float NaN would instead
                # make the Python shift below raise TypeError.
                d = 0
        if b[c + 1] == Yb:
            d = unsigned_right_shitf(a, d)
        else:
            d = left(a, d)
        if b[c] == Yb:
            a = int_overflow(a + d & 4294967295)
        else:
            a = int_overflow(a ^ d)
    return a
def tk(a):
    """Compute the Google Translate ``tk`` token for the text *a*.

    The text is converted to UTF-8 bytes, the bytes are folded into a
    32-bit hash with ``RL`` starting from a fixed seed, and the result is
    formatted as ``"<hash % 1000000>.<(hash % 1000000) ^ seed>"``.

    Args:
        a: The text to sign (``str``).

    Returns:
        The token string: two decimal integers joined by ``"."``.
    """
    b = 406644
    b1 = 3293161072
    jd = "."
    cb = "+-a^+6"
    Zb = "+-3^+b+-f"

    # The JavaScript reference emits the UTF-8 bytes of the text (merging
    # UTF-16 surrogate pairs into one code point first). Python strings
    # already hold whole code points, so the built-in encoder yields the same
    # bytes. "surrogatepass" keeps lone surrogates as three-byte sequences,
    # as the reference does, instead of raising UnicodeEncodeError.
    e = a.encode("utf-8", "surrogatepass")

    a = b
    for i in e:
        a += i
        a = RL(a, cb)
    a = RL(a, Zb)
    # JS: `a ^= b1 || 0` parses as `a ^= (b1 || 0)`, so the XOR is always
    # applied, including when the hash is 0 at this point.
    a = int_overflow(a ^ b1)
    if a < 0:
        # Reinterpret the negative signed 32-bit value as unsigned.
        a = (a & 2147483647) + 2147483648
    a %= 1000000
    return str(a) + jd + str(a ^ b)
直接调用tk()函数就可以了