数据采集--某度翻译全流程

一、观察网页源代码

目标网站:fanyi.baidu.com,输入需要查询的单词,出现如下界面

右键查看网页源代码:

ctrl+f搜索关键字 '苹果',搜索的结果为0,也就不能用requests直接请求页面来获取数据了;数据可能经过了加密,或者是通过js动态加载的,因此需要对其数据加载的逻辑进行分析

二、抓包查看参数

重新点开fanyi.baidu.com进入到翻译界面,右键打开检查,勾选Network-->Fetch/XHR,在左边粘贴所要输入的单词,比如:apple

粘贴输入完成后出现了如下界面,挨个进行查看是否有翻译过后的内容

在sug的请求中发现有一些翻译的内容,需要注意的是,这里的翻译内容并非是我们需要的那个翻译

于是继续往下寻找,在v2transapi开头的请求中发现翻译的结果

查看请求的参数信息

请求方式为post,参数在payload下面查看

三、python发送请求获取数据

于是回到界面中编写代码,发送请求数据

import requests


# Endpoint observed in the browser's Network panel (the "v2transapi" request).
# The query string repeats the source/target language pair sent in the form body.
url = r'https://fanyi.baidu.com/v2transapi?from=en&to=zh'

# Form fields copied from the captured POST payload.
# 'sign' is the value captured for the query 'apple'; per the article it changes
# together with 'query', so this payload is only valid for that one word.
data = {
    'from': 'en',
    'to': 'zh',
    'query': 'apple',
    'transtype': 'realtime',
    'simple_means_flag': 3,
    'sign': '704513.926512',
    'token': '57ad27fdfc82112d8227dd87c55f633f',
    'domain': 'common',
}

# Request headers copied from the browser session.
# NOTE(review): the Cookie value comes from the author's own browser session
# (it appears to include login-related entries such as BDUSS) -- readers should
# presumably substitute the cookie from their own session.
headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Cookie':'__yjs_duid=1_0494dcb7db96dbf2e7eb8e643fb6ec9e1635902286958; REALTIME_TRANS_SWITCH=1; HISTORY_SWITCH=1; FANYI_WORD_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; BDUSS=pleldUVEk1NGlDUmExNDNJSGYwWFFDcX5SaHhmNUg4OGk5b0l1Wm42UDFIckpoRUFBQUFBJCQAAAAAAAAAAAEAAADXXK1EMzE5Nzc2MTAydwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPWRimH1kYphR0; BDUSS_BFESS=pleldUVEk1NGlDUmExNDNJSGYwWFFDcX5SaHhmNUg4OGk5b0l1Wm42UDFIckpoRUFBQUFBJCQAAAAAAAAAAAEAAADXXK1EMzE5Nzc2MTAydwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPWRimH1kYphR0; BAIDUID=CE404BEB4FC3648E0C1F8B039B8EA0AC:FG=1; BIDUPSID=CE404BEB4FC3648E0C1F8B039B8EA0AC; PSTM=1638351819; BAIDUID_BFESS=CE404BEB4FC3648E0C1F8B039B8EA0AC:FG=1; APPGUIDE_10_0_2=1; H_PS_PSSID=35639_34441_35468_35104_31660_35628_35488_34584_35491_35688_35542_35665_35318_26350_35619_35562; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1641048772,1641049491; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1641090825; __yjs_st=2_YjYyODIwODU2MTUyOWZlYTY2MWViYTBhMTZkODczMWQxMzlhYjU1MzlmM2QzM2NiYjE3NGNjOTU0ZDgzNDk3ZGU1M2Y3NTZiNDJjZDViZGJkZWIxMTllYzE1MTBjM2FiOGIzMTI1MDBlZjBmNmY2MGNkNTM1YmEzYTYxZTlmMjBmNDcxMTVkOWNiM2M2ZWViNjRiYzRkYzk4NWE2ZTFiMjI2MTI2MWFjYTJhMTEyMTg5MDk2MDcxZTNhOGRlMDFkYjE3MzRiNmI0OTcxMTAwYmQ4ZGZmY2NjMGMyYTM4NjA0Y2UzNGZhMDRhMDIwYmVkNjljYzhmM2I1NTQ2Nzg0ZV83X2Q1OWE3ZTQ0; ab_sr=1.0.1_ODJhZWI3ZTc5MDRhZjA4OGUzYjM3YjdjMjlhNzM0OWUwYjM5ODBhNGVlNjg0MDY4NjI0NWI0NDZhMGQ4MDRhZDVlYTAzNDZjNzIzNDM4YzY4ZTNmZTRlZmY5MTFmMjU3MzkwYTVlMDQyMjJjODYzZTMzYjVmYjAxMzE2ODk3ZDUyYzI2ZTNlY2QwNTY0M2I3MmI4NTNmMWFkZTVmZTM3ZDk5YWJiNjA0ZDMxNjhlMTczYjkxZDE4MzQzMWZiZGVh'
}


if __name__ == '__main__':

    # Without a timeout, requests waits indefinitely if the server never responds.
    resp = requests.post(url, headers=headers, data=data, timeout=10)
    print(resp.json())

此处运行代码可以获取到返回的json数据,我们需要的结果也在json里边,接下来自行取结果就可以了

四、逆向,参数寻找

上边是针对apple这个单词的翻译,那么假如我们需要查询pig这个单词呢,每次修改参数去请求太麻烦了,继续分析:

可以发现当切换不同的单词的时候,请求的参数在跟着改变,将请求的data进行改变,重新访问,访问成功

经过不断变换参数可以发现影响结果的有两个参数,query和sign,其中query是我们输入的单词,sign是一串我们看不懂的数字,因此只要知道了sign这个参数,那么问题就解决了

重新回到浏览器,从上往下寻找参数

在第三个文件中发现目标

设置断点进行debug

鼠标点击1或者2进入到该函数中

五、编写js,本地调用

点击之后进入到如下界面,观察一下这个函数的上下文可以发现它是一个闭包函数,可以将其改写成python代码来实现,也可以将其作为js直接运行,此处使用第二种方式:

将整个函数内部copy出来新建一个js文件baidu.js

/**
 * Return a new array holding the elements of `r`.
 *
 * Real arrays are copied slot by slot into a fresh array of the same length;
 * anything else (strings, Sets, array-likes, ...) is converted via Array.from.
 *
 * @param {*} r - array, iterable or array-like value to copy.
 * @returns {Array} a new array; never the same object as `r`.
 */
function a(r) {
    if (!Array.isArray(r)) {
        return Array.from(r);
    }
    var copy = new Array(r.length);
    var idx = 0;
    while (idx < r.length) {
        copy[idx] = r[idx];
        idx += 1;
    }
    return copy;
}

/**
 * Run a small "program" of 3-character instructions over the integer `r`.
 *
 * Each instruction in `o` is read as three characters:
 *   [0] combine step: "+" adds (result wrapped to a signed 32-bit value),
 *       anything else XORs;
 *   [1] shift direction: "+" is an unsigned right shift, anything else a left shift;
 *   [2] shift amount: a digit is used as-is, a letter from "a" upwards maps to
 *       its char code minus 87 ("a" -> 10, "b" -> 11, ...).
 *
 * @param {number} r - starting value.
 * @param {string} o - instruction string; trailing characters that do not form
 *     a full triplet are ignored.
 * @returns {number} the transformed value.
 */
function n(r, o) {
    var acc = r;
    for (var pos = 0; pos < o.length - 2; pos += 3) {
        var combineOp = o.charAt(pos);
        var shiftDir = o.charAt(pos + 1);
        var amountChar = o.charAt(pos + 2);

        var amount;
        if (amountChar >= "a") {
            amount = amountChar.charCodeAt(0) - 87;
        } else {
            amount = Number(amountChar);
        }

        var shifted;
        if ("+" === shiftDir) {
            shifted = acc >>> amount;
        } else {
            shifted = acc << amount;
        }

        if ("+" === combineOp) {
            acc = (acc + shifted) & 4294967295;
        } else {
            acc = acc ^ shifted;
        }
    }
    return acc;
}

/**
 * Compute the `sign` string for the text `r`.
 *
 * Steps, in order:
 *   1. If the text is longer than 30 units, reduce it to a 30-unit sample
 *      (first 10 + middle 10 + last 10).
 *   2. Read a "<m>.<s>" seed string (cached in `i`, otherwise `window.gtk`).
 *   3. Encode the text as UTF-8 bytes.
 *   4. Fold the bytes into a 32-bit value with `n`, mix in `s`, and reduce
 *      modulo 1e6.
 *
 * NOTE: this function reads the free names `i` and `window` and calls the
 * sibling functions `a` and `n`; `i` and `window` are not defined anywhere in
 * this listing and must be supplied by the surrounding environment.
 *
 * @param {string} r - text to sign.
 * @returns {string} "<p>.<p ^ m>", where p is the reduced hash and m the
 *     first half of the seed.
 */
function e(r) {
    // Surrogate pairs (characters outside the BMP) present in the input, or null.
    var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
    if (null === o) {
        // No surrogate pairs: sample directly on the string when it exceeds 30 units.
        var t = r.length;
        t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr(-10, 10))
    } else {
        // Rebuild the text as a list `f` in which each surrogate pair is ONE element,
        // by interleaving the pieces between pairs (split per UTF-16 unit) with the pairs.
        for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++)
            "" !== e[C] && f.push.apply(f, a(e[C].split(""))),
            C !== h - 1 && f.push(o[C]);
        // Same first/middle/last sampling, but counted in elements of `f`.
        var g = f.length;
        g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice(-10).join(""))
    }
    // `l` is the string "gtk", assembled from char codes 103, 116, 107.
    var u = void 0
        , l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
    // Use the cached seed in `i` when it is not null; otherwise read window.gtk
    // (falling back to "") and store it in `i`.
    u = null !== i ? i : (i = window[l] || "") || "";
    // Split the seed "<m>.<s>" into its two numeric halves (0 when missing or not numeric),
    // then encode `r` into the byte array `S` as UTF-8: 1 byte below 128, 2 bytes below 2048,
    // 4 bytes for a high surrogate followed by a low surrogate, 3 bytes otherwise.
    for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) {
        var A = r.charCodeAt(v);
        128 > A ? S[c++] = A : (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)),
            S[c++] = A >> 18 | 240,
            S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224,
            S[c++] = A >> 6 & 63 | 128),
            S[c++] = 63 & A | 128)
    }
    // F is the instruction string "+-a^+6" and D is "+-3^+b+-f" (both built from char codes).
    // Starting from m, add each byte and scramble with n(p, F).
    for (var p = m, F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++)
        p += S[b],
            p = n(p, F);
    // Final scramble with D, XOR with s, convert a negative 32-bit result to its
    // unsigned value, keep the last six decimal digits, and format the result.
    return p = n(p, D),
        p ^= s,
    0 > p && (p = (2147483647 & p) + 2147483648),
        p %= 1e6,
    p.toString() + "." + (p ^ m)
}

将刚刚的代码进行调整,加载这个js文件

import requests
import execjs

# Endpoint observed in the browser's Network panel (the "v2transapi" request).
url = r'https://fanyi.baidu.com/v2transapi?from=en&to=zh'

# Compile the signing code from baidu.js. The context manager guarantees the
# file handle is closed as soon as its contents have been read.
with open("baidu.js", mode="r", encoding="utf-8") as f:
    obj = execjs.compile(f.read())

word = input('word:')

# Same form fields as the captured request, except that 'query' is the user's
# word and 'sign' is computed for that word by the JS function `e`.
data = {
    'from': 'en',
    'to': 'zh',
    'query': word,
    'transtype': 'realtime',
    'simple_means_flag': 3,
    'sign': obj.call("e", word),
    'token': '57ad27fdfc82112d8227dd87c55f633f',
    'domain': 'common',
}

# Request headers copied from the browser session.
# NOTE(review): the Cookie value comes from the author's own browser session
# (it appears to include login-related entries such as BDUSS) -- readers should
# presumably substitute the cookie from their own session.
headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Cookie':'__yjs_duid=1_0494dcb7db96dbf2e7eb8e643fb6ec9e1635902286958; REALTIME_TRANS_SWITCH=1; HISTORY_SWITCH=1; FANYI_WORD_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; BDUSS=pleldUVEk1NGlDUmExNDNJSGYwWFFDcX5SaHhmNUg4OGk5b0l1Wm42UDFIckpoRUFBQUFBJCQAAAAAAAAAAAEAAADXXK1EMzE5Nzc2MTAydwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPWRimH1kYphR0; BDUSS_BFESS=pleldUVEk1NGlDUmExNDNJSGYwWFFDcX5SaHhmNUg4OGk5b0l1Wm42UDFIckpoRUFBQUFBJCQAAAAAAAAAAAEAAADXXK1EMzE5Nzc2MTAydwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPWRimH1kYphR0; BAIDUID=CE404BEB4FC3648E0C1F8B039B8EA0AC:FG=1; BIDUPSID=CE404BEB4FC3648E0C1F8B039B8EA0AC; PSTM=1638351819; BAIDUID_BFESS=CE404BEB4FC3648E0C1F8B039B8EA0AC:FG=1; APPGUIDE_10_0_2=1; H_PS_PSSID=35639_34441_35468_35104_31660_35628_35488_34584_35491_35688_35542_35665_35318_26350_35619_35562; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1641048772,1641049491; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1641090825; __yjs_st=2_YjYyODIwODU2MTUyOWZlYTY2MWViYTBhMTZkODczMWQxMzlhYjU1MzlmM2QzM2NiYjE3NGNjOTU0ZDgzNDk3ZGU1M2Y3NTZiNDJjZDViZGJkZWIxMTllYzE1MTBjM2FiOGIzMTI1MDBlZjBmNmY2MGNkNTM1YmEzYTYxZTlmMjBmNDcxMTVkOWNiM2M2ZWViNjRiYzRkYzk4NWE2ZTFiMjI2MTI2MWFjYTJhMTEyMTg5MDk2MDcxZTNhOGRlMDFkYjE3MzRiNmI0OTcxMTAwYmQ4ZGZmY2NjMGMyYTM4NjA0Y2UzNGZhMDRhMDIwYmVkNjljYzhmM2I1NTQ2Nzg0ZV83X2Q1OWE3ZTQ0; ab_sr=1.0.1_ODJhZWI3ZTc5MDRhZjA4OGUzYjM3YjdjMjlhNzM0OWUwYjM5ODBhNGVlNjg0MDY4NjI0NWI0NDZhMGQ4MDRhZDVlYTAzNDZjNzIzNDM4YzY4ZTNmZTRlZmY5MTFmMjU3MzkwYTVlMDQyMjJjODYzZTMzYjVmYjAxMzE2ODk3ZDUyYzI2ZTNlY2QwNTY0M2I3MmI4NTNmMWFkZTVmZTM3ZDk5YWJiNjA0ZDMxNjhlMTczYjkxZDE4MzQzMWZiZGVh'
}


if __name__ == '__main__':

    # Without a timeout, requests waits indefinitely if the server never responds.
    resp = requests.post(url, headers=headers, data=data, timeout=10)
    print(resp.json())

运行之后会报如下错误:

六、调整js文件

返回刚刚编写的js文件:

发现有个window,在nodejs环境中没有浏览器的window概念,因此需要手工添加一个window,

将上一行中变量 l 的表达式 copy 出来,放到浏览器控制台中运行(得到的结果是字符串 'gtk')

这个gtk的值在浏览器中搜索关键字获取

gtk对应的值为'320305.131321201'

在baidu.js文件开头添加:

并修改js代码

修改后的js完整代码:

// Minimal stand-in for the browser's `window` object: outside a browser there
// is no `window`, and the signing function only reads its `gtk` property.
var window = { gtk: "320305.131321201" };

/**
 * Return a new array holding the elements of `r`.
 *
 * Real arrays are copied slot by slot into a fresh array of the same length;
 * anything else (strings, Sets, array-likes, ...) is converted via Array.from.
 *
 * @param {*} r - array, iterable or array-like value to copy.
 * @returns {Array} a new array; never the same object as `r`.
 */
function a(r) {
    if (!Array.isArray(r)) {
        return Array.from(r);
    }
    var copy = new Array(r.length);
    var idx = 0;
    while (idx < r.length) {
        copy[idx] = r[idx];
        idx += 1;
    }
    return copy;
}

/**
 * Run a small "program" of 3-character instructions over the integer `r`.
 *
 * Each instruction in `o` is read as three characters:
 *   [0] combine step: "+" adds (result wrapped to a signed 32-bit value),
 *       anything else XORs;
 *   [1] shift direction: "+" is an unsigned right shift, anything else a left shift;
 *   [2] shift amount: a digit is used as-is, a letter from "a" upwards maps to
 *       its char code minus 87 ("a" -> 10, "b" -> 11, ...).
 *
 * @param {number} r - starting value.
 * @param {string} o - instruction string; trailing characters that do not form
 *     a full triplet are ignored.
 * @returns {number} the transformed value.
 */
function n(r, o) {
    var acc = r;
    for (var pos = 0; pos < o.length - 2; pos += 3) {
        var combineOp = o.charAt(pos);
        var shiftDir = o.charAt(pos + 1);
        var amountChar = o.charAt(pos + 2);

        var amount;
        if (amountChar >= "a") {
            amount = amountChar.charCodeAt(0) - 87;
        } else {
            amount = Number(amountChar);
        }

        var shifted;
        if ("+" === shiftDir) {
            shifted = acc >>> amount;
        } else {
            shifted = acc << amount;
        }

        if ("+" === combineOp) {
            acc = (acc + shifted) & 4294967295;
        } else {
            acc = acc ^ shifted;
        }
    }
    return acc;
}

/**
 * Compute the `sign` string for the text `r`.
 *
 * Steps, in order:
 *   1. If the text is longer than 30 units, reduce it to a 30-unit sample
 *      (first 10 + middle 10 + last 10).
 *   2. Read a "<m>.<s>" seed string (cached in `i`, otherwise `window['gtk']`).
 *   3. Encode the text as UTF-8 bytes.
 *   4. Fold the bytes into a 32-bit value with `n`, mix in `s`, and reduce
 *      modulo 1e6.
 *
 * Relies on names defined outside this function: `window`, `i`, `a` and `n`.
 *
 * @param {string} r - text to sign.
 * @returns {string} "<p>.<p ^ m>", where p is the reduced hash and m the
 *     first half of the seed.
 */
function e(r) {
    // Surrogate pairs (characters outside the BMP) present in the input, or null.
    var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
    if (null === o) {
        // No surrogate pairs: sample directly on the string when it exceeds 30 units.
        var t = r.length;
        t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr(-10, 10))
    } else {
        // Rebuild the text as a list `f` in which each surrogate pair is ONE element,
        // by interleaving the pieces between pairs (split per UTF-16 unit) with the pairs.
        for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++)
            "" !== e[C] && f.push.apply(f, a(e[C].split(""))),
            C !== h - 1 && f.push(o[C]);
        // Same first/middle/last sampling, but counted in elements of `f`.
        var g = f.length;
        g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice(-10).join(""))
    }
    var u = void 0;
        // Original declaration of `l`, kept for reference; char codes 103, 116, 107
        // spell "gtk", which is now written literally in the lookup below.
        // , l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
    // Use the cached seed in `i` when it is not null; otherwise read window['gtk']
    // (falling back to "") and store it in `i`.
    u = null !== i ? i : (i = window['gtk'] || "") || "";
    // Split the seed "<m>.<s>" into its two numeric halves (0 when missing or not numeric),
    // then encode `r` into the byte array `S` as UTF-8: 1 byte below 128, 2 bytes below 2048,
    // 4 bytes for a high surrogate followed by a low surrogate, 3 bytes otherwise.
    for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) {
        var A = r.charCodeAt(v);
        128 > A ? S[c++] = A : (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)),
            S[c++] = A >> 18 | 240,
            S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224,
            S[c++] = A >> 6 & 63 | 128),
            S[c++] = 63 & A | 128)
    }
    // F is the instruction string "+-a^+6" and D is "+-3^+b+-f" (both built from char codes).
    // Starting from m, add each byte and scramble with n(p, F).
    for (var p = m, F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++)
        p += S[b],
            p = n(p, F);
    // Final scramble with D, XOR with s, convert a negative 32-bit result to its
    // unsigned value, keep the last six decimal digits, and format the result.
    return p = n(p, D),
        p ^= s,
    0 > p && (p = (2147483647 & p) + 2147483648),
        p %= 1e6,
    p.toString() + "." + (p ^ m)
}

// Cache for the seed string read by e(): null until the first call, after which
// e() stores the value it read from window['gtk'] here and reuses it.
var i = null;

七、执行py文件获取结果

重新运行py文件,成功获取结果(后续json处理此处省略)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值