目录
Package安装
使用requests获取请求URL的结果,使用hashlib进行MD5加密(hashlib是Python标准库,无需安装),使用pycryptodome进行AES解密。需要安装的包如下:
$ pip install requests pycryptodome
爬取有道词典
有道词典网页地址:网易有道
操作步骤
有道词典爬虫的操作步骤,具体如下:
- 观察网页变化查找URL和请求参数:
-
- 内容响应的URL:https://dict.youdao.com/jsonapi_s?doctype=json&jsonversion=4
-
- 请求参数:Form Data中只有t和sign有变化
-
- 通过在网页的JS文件中去查找t和sign,明确了t为时间戳,sign为加密标志。通过t的变化,md5加密生成sign
-
- 请求参数参照js文件进行构造如下:
from hashlib import md5
def get_form_data(text, le):
    """
    Build the form parameters for a Youdao dictionary request.

    The signature is a nested MD5: the digest of ``text + keyfrom`` is
    embedded in a second digest that also covers the client name, the
    text, ``t`` and a fixed secret string.

    :param text: text to translate
    :param le: target language code
    :return: dict of form fields to POST
    """
    # Fixed values.
    secret = 'Mk6hqtUp33DGGtoS63tTJbMUYjRrG1Lu'
    key_from = 'webdict'
    client = 'web'

    salted = text + key_from
    # ``t`` is the length of the salted text modulo 10.
    t = len(salted) % 10
    inner_digest = md5(salted.encode('utf-8')).hexdigest()
    sign_source = ''.join((client, text, str(t), secret, inner_digest))
    sign = md5(sign_source.encode('utf-8')).hexdigest()

    return {
        'q': text,
        'le': le,
        't': t,
        'client': client,
        'sign': sign,
        'keyfrom': key_from,
    }
- heads的确定:通过postman进行确定请求头head的组成
-
- 选中URL,鼠标右键Copy as cURL(bash)
-
- 在postman中进行粘贴URL,测试哪些是有影响的head头:
案例代码
根据以上的操作后,最终代码如下:
from hashlib import md5
import requests
def get_form_data(text, le):
    """
    Build the form parameters for a Youdao dictionary request.

    The signature is a nested MD5: the digest of ``text + keyfrom`` is
    embedded in a second digest that also covers the client name, the
    text, ``t`` and a fixed secret string.

    :param text: text to translate
    :param le: target language code
    :return: dict of form fields to POST
    """
    # Fixed values.
    secret = 'Mk6hqtUp33DGGtoS63tTJbMUYjRrG1Lu'
    key_from = 'webdict'
    client = 'web'

    salted = text + key_from
    # ``t`` is the length of the salted text modulo 10.
    t = len(salted) % 10
    inner_digest = md5(salted.encode('utf-8')).hexdigest()
    sign_source = ''.join((client, text, str(t), secret, inner_digest))
    sign = md5(sign_source.encode('utf-8')).hexdigest()

    return {
        'q': text,
        'le': le,
        't': t,
        'client': client,
        'sign': sign,
        'keyfrom': key_from,
    }
def translate(query, to_lan):
    """
    Translate ``query`` through the Youdao dictionary web endpoint.

    On any failure the error is printed and a fallback string is returned
    instead of raising.

    :param query: text to translate
    :param to_lan: target language code
    :return: the first web translation, or ``'翻译失败:' + query`` on failure
    """
    # Youdao dictionary request URL.
    url = 'https://dict.youdao.com/jsonapi_s?doctype=json&jsonversion=4'
    form_data = get_form_data(query, to_lan)
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.post(url, data=form_data, timeout=10)
        # Surface HTTP errors with a clear message instead of a parse failure.
        response.raise_for_status()
        res = response.json()
        # Take the first web translation.
        return res['web_trans']['web-translation'][0]['trans'][0]['value']
    except Exception as e:
        # Deliberate best-effort boundary: report and fall back.
        print('翻译失败:', e)
        return '翻译失败:' + query
if __name__ == '__main__':
    # Language options of the Youdao dictionary (label -> code):
    #   auto-detect       -> ''
    #   Chinese-English   -> 'en'
    #   Chinese-French    -> 'fr'
    #   Chinese-Korean    -> 'ko'
    #   Chinese-Japanese  -> 'ja'
    #
    # Other sample calls:
    #   translate('早上好', 'ja')
    #   translate('你好', 'fr')
    #   translate('你好', 'ko')
    #   translate('你好', '')
    #   translate('你好', 'en')
    target_lang = 'ja'
    text = input("请输入你要翻译的文字: ")
    print('翻译结果:\n', translate(text, target_lang))
爬取有道翻译--新版
有道翻译网页地址:有道翻译_文本、文档、网页、在线即时翻译
操作步骤
有道翻译爬虫的操作步骤,具体如下:
- 观察网页变化查找URL和请求参数:
-
- 内容响应的URL:https://dict.youdao.com/webtranslate
-
- 请求参数:Form Data中只有sign和mysticTime有变化
-
- 通过在网页的JS文件中去查找mysticTime和sign,明确了mysticTime为时间戳,sign为加密标志。通过mysticTime的变化,md5加密生成sign
-
- 请求参数参照js文件进行构造如下:
from hashlib import md5
def get_form_data(sentence, from_lang, to_lang):
    """
    Build the form parameters for a Youdao translate request.

    ``sign`` is the MD5 hex digest of the URL-encoded string
    ``client=...&mysticTime=...&product=...&key=...``.

    :param sentence: text to translate
    :param from_lang: source language code
    :param to_lang: target language code
    :return: dict of form fields to POST, including ``sign`` and ``mysticTime``
    """
    # Imported locally so this snippet runs on its own: the only import that
    # accompanies it is ``from hashlib import md5``.
    import time
    from urllib.parse import urlencode

    secret_key = 'fsdsogkndfokasodnaso'
    client = 'fanyideskweb'
    product = 'webfanyi'
    point_param = 'client,mysticTime,product'
    app_version = '1.0.0'
    vendor = 'web'
    key_from = 'fanyi.web'
    # NOTE(review): this is float seconds; the site's JS presumably uses an
    # integer millisecond timestamp -- confirm against the JS file.
    mystic_time = time.time()

    # The signature covers these fields in exactly this order.
    sign_fields = {
        'client': client,
        'mysticTime': mystic_time,
        'product': product,
        'key': secret_key,
    }
    sign = md5(urlencode(sign_fields).encode('utf-8')).hexdigest()

    return {
        'i': sentence,
        'from': from_lang,
        'to': to_lang,
        'domain': 0,
        'dictResult': 'true',
        'keyid': product,
        'sign': sign,
        'client': client,
        'product': product,
        'appVersion': app_version,
        'vendor': vendor,
        'pointParam': point_param,
        'mysticTime': mystic_time,
        'keyfrom': key_from,
    }
- 通过AES进行解密翻译结果:
-
- 在js文件中找到key和iv值
AES对应的package安装:
$ pip install pycryptodome
AES的类构造如下:
import hashlib
import base64
from urllib.parse import urlencode
from Crypto.Cipher import AES
from Crypto.Util.Padding import unpad, pad
class AESCipher:
    """
    AES-CBC helper with a fixed key and IV.

    ``key`` and ``iv`` are the 16-byte MD5 digests of two seed strings.
    Ciphertext is exchanged as base64 using the URL-safe alphabet
    (``-`` and ``_`` in place of ``+`` and ``/``).
    """

    # AES key: MD5 digest of the key seed.
    key = hashlib.md5(
        b'ydsecret://query/key/B*RGygVywfNBwpmBaZg*WT7SIOUP2T0C9WHMZN39j^DAdaZhAnxvGcCY6VYFwnHl'
    ).digest()
    # CBC initialisation vector: MD5 digest of the IV seed.
    iv = hashlib.md5(
        b'ydsecret://query/iv/C@lZe2YzHtZ2CYgaXKSVfsb7Y4QWHjITPPZ0nQp87fBeJ!Iv6v^6fvi2WN@bYpJ4'
    ).digest()

    @staticmethod
    def decrypt(data):
        """
        Decrypt URL-safe base64 ciphertext and return the decoded text.

        :param data: base64 ciphertext (URL-safe alphabet)
        :return: decrypted, unpadded text as ``str``
        """
        cipher = AES.new(AESCipher.key, AES.MODE_CBC, iv=AESCipher.iv)
        raw = base64.urlsafe_b64decode(data)
        return unpad(cipher.decrypt(raw), AES.block_size).decode()

    @staticmethod
    def encrypt(plaintext: str):
        """
        Encrypt ``plaintext`` and return URL-safe base64 ciphertext.

        :param plaintext: text to encrypt
        :return: base64 ciphertext as ``bytes``
        """
        cipher = AES.new(AESCipher.key, AES.MODE_CBC, iv=AESCipher.iv)
        padded = pad(plaintext.encode(), AES.block_size)
        return base64.urlsafe_b64encode(cipher.encrypt(padded))
- heads的确定:通过postman进行确定请求头head的组成
-
- 选中URL,鼠标右键Copy as cURL(bash)
-
- 在postman中进行粘贴URL,测试哪些是有影响的head头:
案例代码
根据以上的操作后,最终代码如下:
import hashlib
import base64
import requests
import json
import time
from urllib.parse import urlencode
from Crypto.Cipher import AES
from Crypto.Util.Padding import unpad, pad
class AESCipher:
    """
    AES-CBC helper with a fixed key and IV.

    ``key`` and ``iv`` are the 16-byte MD5 digests of two seed strings.
    Ciphertext is exchanged as base64 using the URL-safe alphabet
    (``-`` and ``_`` in place of ``+`` and ``/``).
    """

    # AES key: MD5 digest of the key seed.
    key = hashlib.md5(
        b'ydsecret://query/key/B*RGygVywfNBwpmBaZg*WT7SIOUP2T0C9WHMZN39j^DAdaZhAnxvGcCY6VYFwnHl'
    ).digest()
    # CBC initialisation vector: MD5 digest of the IV seed.
    iv = hashlib.md5(
        b'ydsecret://query/iv/C@lZe2YzHtZ2CYgaXKSVfsb7Y4QWHjITPPZ0nQp87fBeJ!Iv6v^6fvi2WN@bYpJ4'
    ).digest()

    @staticmethod
    def decrypt(data):
        """
        Decrypt URL-safe base64 ciphertext and return the decoded text.

        :param data: base64 ciphertext (URL-safe alphabet)
        :return: decrypted, unpadded text as ``str``
        """
        cipher = AES.new(AESCipher.key, AES.MODE_CBC, iv=AESCipher.iv)
        raw = base64.urlsafe_b64decode(data)
        return unpad(cipher.decrypt(raw), AES.block_size).decode()

    @staticmethod
    def encrypt(plaintext: str):
        """
        Encrypt ``plaintext`` and return URL-safe base64 ciphertext.

        :param plaintext: text to encrypt
        :return: base64 ciphertext as ``bytes``
        """
        cipher = AES.new(AESCipher.key, AES.MODE_CBC, iv=AESCipher.iv)
        padded = pad(plaintext.encode(), AES.block_size)
        return base64.urlsafe_b64encode(cipher.encrypt(padded))
def get_form_data(sentence, from_lang, to_lang):
    """
    Build the form parameters for a Youdao translate request.

    ``sign`` is the MD5 hex digest of the URL-encoded string
    ``client=...&mysticTime=...&product=...&key=...``.

    :param sentence: text to translate
    :param from_lang: source language code
    :param to_lang: target language code
    :return: dict of form fields to POST, including ``sign`` and ``mysticTime``
    """
    secret_key = 'fsdsogkndfokasodnaso'
    client = 'fanyideskweb'
    product = 'webfanyi'
    mystic_time = time.time()

    # The signature covers these fields in exactly this order.
    sign_source = urlencode({
        'client': client,
        'mysticTime': mystic_time,
        'product': product,
        'key': secret_key,
    })
    sign = hashlib.md5(sign_source.encode('utf-8')).hexdigest()

    return {
        'i': sentence,
        'from': from_lang,
        'to': to_lang,
        'domain': 0,
        'dictResult': 'true',
        'keyid': product,
        'sign': sign,
        'client': client,
        'product': product,
        'appVersion': '1.0.0',
        'vendor': 'web',
        'pointParam': 'client,mysticTime,product',
        'mysticTime': mystic_time,
        'keyfrom': 'fanyi.web',
    }
def translate(sentence, from_lang='auto', to_lang=''):
    """
    Translate ``sentence`` through the Youdao translate web endpoint.

    The response body is decrypted with ``AESCipher.decrypt`` and parsed as
    JSON. On any failure the error is printed and a fallback string is
    returned instead of raising.

    :param sentence: sentence to translate
    :param from_lang: source language code
    :param to_lang: target language code
    :return: the translated text, or ``'翻译失败:' + sentence`` on failure
    """
    # Youdao translate request URL and headers.
    url = 'https://dict.youdao.com/webtranslate'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        'referer': 'https://fanyi.youdao.com/',
        'cookie': 'OUTFOX_SEARCH_USER_ID=-805044645@10.112.57.88; OUTFOX_SEARCH_USER_ID_NCOO=818822109.5585971;'
    }
    params = get_form_data(sentence, from_lang, to_lang)
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        res = requests.post(url, headers=headers, data=params, timeout=10)
        # Surface HTTP errors with a clear message instead of a decrypt failure.
        res.raise_for_status()
        # AES-decrypt the response body, then parse the JSON inside it.
        ret = json.loads(AESCipher.decrypt(res.text))
        return ret['translateResult'][0][0]['tgt']
    except Exception as e:
        # Deliberate best-effort boundary: report and fall back.
        print('翻译失败:', e)
        return '翻译失败:' + sentence
if __name__ == '__main__':
    # Read the text, translate it from Simplified Chinese to Japanese and
    # print the result. With the defaults instead: translate(text)
    text = input("请输入你要翻译的文字: ")
    print('翻译结果:\n', translate(text, 'zh-CHS', 'ja'))
爬取有道翻译--旧版
案例代码
爬取有道翻译--旧版,代码如下:
import requests
def translate(content):
    """
    Translate ``content`` through the legacy Youdao translate endpoint.

    On any failure the error is printed and a fallback string is returned
    instead of raising.

    :param content: text to translate
    :return: the translated text, or ``'翻译失败:' + content`` on failure
    """
    url = 'https://fanyi.youdao.com/translate'
    data = {
        'doctype': 'json',
        'i': content,
    }
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        r = requests.get(url, params=data, timeout=10)
        # Surface HTTP errors with a clear message instead of a parse failure.
        r.raise_for_status()
        res_json = r.json()
        return res_json['translateResult'][0][0]['tgt']
    except Exception as e:
        # Deliberate best-effort boundary: report and fall back.
        print('翻译失败:', e)
        return '翻译失败:' + content
if __name__ == '__main__':
    # Prompt for the text, translate it and print the result.
    print('翻译结果:\n', translate(input("请输入需翻译的文字: ")))
总结
网站代码运行流程: