翻译要求
- 翻译不限于中英。
- json字符串只翻译值,不翻译键。
- json字符串以文本形式存放,翻译完也要采用文本存放。
问题分析
- json数据在python中本质上是由多个字典组成的列表;如果只有一个字典,那么也可以是字典。
- 重难点在于如何完整地遍历整个json数据。
- 翻译过程直接爬取谷歌翻译。
解决过程
预创建
为了确保整个过程顺利,下面是需要安装的包,所有的包均可采用pip安装:
requests,urllib3,pyexecjs,demjson
翻译爬虫
翻译爬虫,这里选择直接从github上扒一个过来,地址及链接如下:
https://github.com/neverneverendup/Translator
代码如下,为了方便调用,做了些许修改:
# -*- coding:utf-8 -*-
import urllib
import urllib.request
import urllib.parse
import requests
import execjs
class Google():
    """Small client for the ``translate_a/single`` endpoint of translate.google.cn.

    The request token (``tk``) is computed by running a JavaScript routine
    through ``execjs``; the HTTP request itself is sent with a
    ``requests.Session``.
    """

    def __init__(self):
        # Display name -> language code accepted by the endpoint.
        self.lan_dict = {
            '中文': 'zh-CN',
            '英文': 'en',
            '俄文': 'ru',
            '法文': 'fr',
            '日文': 'ja',
            '韩文': 'ko',
            '老挝': 'lo',
            '缅甸': 'my',
            '泰语': 'th',
            '越南': 'vi'
        }
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
        self.url = 'http://translate.google.cn/translate_a/single'
        self.session = requests.Session()
        # NOTE(review): requests.Session has no documented `keep_alive`
        # attribute, so this assignment probably has no effect - confirm.
        self.session.keep_alive = False

    def getTk(self, text):
        """Return the ``tk`` token for *text* as computed by the JS ``TL`` function."""
        return self.get_ctx().call("TL", text)

    def get_ctx(self):
        """Compile and return the execjs context that defines ``TL`` and ``RL``."""
        ctx = execjs.compile("""
function TL(a) {
var k = "";
var b = 406644;
var b1 = 3293161072;
var jd = ".";
var $b = "+-a^+6";
var Zb = "+-3^+b+-f";
for (var e = [], f = 0, g = 0; g < a.length; g++) {
var m = a.charCodeAt(g);
128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : (55296 == (m & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023),
e[f++] = m >> 18 | 240,
e[f++] = m >> 12 & 63 | 128) : e[f++] = m >> 12 | 224,
e[f++] = m >> 6 & 63 | 128),
e[f++] = m & 63 | 128)
}
a = b;
for (f = 0; f < e.length; f++) a += e[f],
a = RL(a, $b);
a = RL(a, Zb);
a ^= b1 || 0;
0 > a && (a = (a & 2147483647) + 2147483648);
a %= 1E6;
return a.toString() + jd + (a ^ b)
};
function RL(a, b) {
var t = "a";
var Yb = "+";
for (var c = 0; c < b.length - 2; c += 3) {
var d = b.charAt(c + 2),
d = d >= t ? d.charCodeAt(0) - 87 : Number(d),
d = b.charAt(c + 1) == Yb ? a >>> d: a << d;
a = b.charAt(c) == Yb ? a + d & 4294967295 : a ^ d
}
return a
}
""")
        return ctx

    def buildUrl(self, text, tk, sl, tl):
        """Build the full request URL for translating *text* into *tl*.

        Args:
            text: Source text; it is percent-encoded into the ``q`` parameter.
            tk: Token produced by :meth:`getTk` for the same text.
            sl: Source language code. Accepted for interface compatibility
                only: the request always sends ``sl=auto`` so the source
                language is detected by the server.
            tl: Target language code.

        Returns:
            The request URL as a string.
        """
        params = [
            # The original author notes that client=webapp gives better
            # translations than client=t.
            ('client', 'webapp'),
            ('sl', 'auto'),
            ('tl', str(tl)),
            ('hl', 'zh-CN'),
            ('dt', 'at'),
            ('dt', 'bd'),
            ('dt', 'ex'),
            ('dt', 'ld'),
            ('dt', 'md'),
            ('dt', 'qca'),
            ('dt', 'rw'),
            ('dt', 'rm'),
            ('dt', 'ss'),
            ('dt', 't'),
            ('ie', 'UTF-8'),
            ('oe', 'UTF-8'),
            ('clearbtn', '1'),
            ('otf', '1'),
            ('pc', '1'),
            ('srcrom', '0'),
            ('ssel', '0'),
            ('tsel', '0'),
            ('kc', '2'),
            ('tk', str(tk)),
        ]
        query = '&'.join(name + '=' + value for name, value in params)
        return self.url + '?' + query + '&q=' + urllib.parse.quote(text)

    def getHtml(self, session, url, headers, timeout=10):
        """GET *url* and return the decoded JSON body, or ``None`` on failure.

        Args:
            session: Object with a ``requests.Session``-compatible ``get``.
            url: Full request URL.
            headers: HTTP headers to send.
            timeout: Seconds to wait before giving up, so that a stalled
                connection cannot block the caller forever.

        Returns:
            The parsed JSON payload, or ``None`` when the request fails, the
            server answers with an HTTP error status, or the body is not JSON.
        """
        try:
            response = session.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError):
            # Best effort by design: the caller turns None into its failure marker.
            return None

    def translate(self, from_lan, to_lan, text):
        """Translate *text* and return the translation split into lines.

        Args:
            from_lan: Source language code, passed to :meth:`buildUrl` as
                ``sl`` (which currently always requests auto-detection).
            to_lan: Target language code.
            text: Text to translate.

        Returns:
            A list with one entry per line of the translated text. When the
            request fails or the payload has an unexpected shape, the list
            ``['谷歌翻译失败']`` is returned and the same message is printed.
        """
        tk = self.getTk(text)
        url = self.buildUrl(text, tk, from_lan, to_lan)
        result = self.getHtml(self.session, url, self.headers)
        if not isinstance(result, list) or not result:
            print('谷歌翻译失败')
            return ['谷歌翻译失败']
        # result[0] holds the translated segments; treat a null entry as
        # "no segments" instead of crashing while iterating over None.
        segments = result[0] or []
        pieces = [seg[0] for seg in segments if seg and seg[0] is not None]
        return ''.join(pieces).split('\n')
def Use(text, to_lan='th', from_lan='zh-CN'):
    """Translate *text* with a fresh ``Google`` client and return one string.

    Args:
        text: Text to translate.
        to_lan: Target language code; change it (or pass another value) to
            translate into a different language.
        from_lan: Source language code forwarded to ``Google.translate``.

    Returns:
        The translated text. ``Google.translate`` returns a list of lines;
        they are joined with newlines so that multi-line input is not cut
        down to its first line.
    """
    gg = Google()
    return '\n'.join(gg.translate(from_lan, to_lan, text))
if __name__ == '__main__':
    # Manual check: read one piece of text from stdin and print its translation.
    text = input("请输入原文:\n")
    translated = Use(text)
    print(translated)
导入数据
导入文本中的json字符串数据。
# Read the raw JSON text; the context manager closes the file even if read() fails.
with open("file.txt", encoding="utf-8") as file:
    # NOTE: `str` shadows the builtin; the name is kept because the decoding
    # step that follows reads the text from this variable.
    str = file.read()
转化对象
将导入的数据转为对象,这里用到了demjson模块。这个模块可以解析一些不规范的json字符串(比如键名没有加引号、字符串使用单引号),并将其转换为Python对象。
# Strict alternative, left disabled: data_json = json.loads(str)
# Decode with demjson instead so that irregular JSON text can still be turned
# into a Python object. Here `str` is the text read from the file, not the builtin.
data_json = demjson.decode(str, encoding="utf-8")
print(data_json)
翻译过程
做到只翻译值而不翻译键,而且还要考虑到这个值有可能是一个字典或列表,因此能翻译的有两种情况:在键对应的值为字符串的情况下翻译值;在键对应的值为列表且列表中不含字典的情况下翻译列表中的每个元素,采用的核心方法为递归。
def translation(dic_json, translate=None, delay=1.9):
    """Translate every string value of a decoded JSON structure in place.

    Dictionaries and lists are walked recursively at any nesting depth.
    Dictionary keys are never touched, and values that are not strings
    (numbers, booleans, ``None``) are left as they are.

    Args:
        dic_json: A ``dict`` or ``list`` produced by a JSON decoder. Any
            other type is ignored.
        translate: Callable that maps a source string to its translation.
            Defaults to ``GoogleTranslation.Use``.
        delay: Seconds to sleep after each translated string, to lower the
            chance of the scraper being detected.

    Returns:
        None. ``dic_json`` is modified in place.
    """
    if translate is None:
        translate = GoogleTranslation.Use

    # Dicts and lists are both addressed as container[slot], so one loop serves both.
    if isinstance(dic_json, dict):
        slots = list(dic_json)
    elif isinstance(dic_json, list):
        slots = range(len(dic_json))
    else:
        return

    for slot in slots:
        value = dic_json[slot]
        if isinstance(value, (dict, list)):
            translation(value, translate, delay)
        elif isinstance(value, str) and value.strip():
            # Value before translation.
            print(value)
            dic_json[slot] = translate(value)
            # Value after translation.
            print(dic_json[slot])
            time.sleep(delay)
写入数据
将翻译后的json字符串写入文本
# Serialise the translated structure straight into the output text file.
with open("file(translation).txt", "w", encoding="utf-8") as fp:
    json.dump(data_json, fp, ensure_ascii=False, indent=4)
附录
GoogleTranslation.py
# -*- coding:utf-8 -*-
import urllib
import urllib.request
import urllib.parse
import requests
import execjs
class Google():
    """Small client for the ``translate_a/single`` endpoint of translate.google.cn.

    The request token (``tk``) is computed by running a JavaScript routine
    through ``execjs``; the HTTP request itself is sent with a
    ``requests.Session``.
    """

    def __init__(self):
        # Display name -> language code accepted by the endpoint.
        self.lan_dict = {
            '中文': 'zh-CN',
            '英文': 'en',
            '俄文': 'ru',
            '法文': 'fr',
            '日文': 'ja',
            '韩文': 'ko',
            '老挝': 'lo',
            '缅甸': 'my',
            '泰语': 'th',
            '越南': 'vi'
        }
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
        self.url = 'http://translate.google.cn/translate_a/single'
        self.session = requests.Session()
        # NOTE(review): requests.Session has no documented `keep_alive`
        # attribute, so this assignment probably has no effect - confirm.
        self.session.keep_alive = False

    def getTk(self, text):
        """Return the ``tk`` token for *text* as computed by the JS ``TL`` function."""
        return self.get_ctx().call("TL", text)

    def get_ctx(self):
        """Compile and return the execjs context that defines ``TL`` and ``RL``."""
        ctx = execjs.compile("""
function TL(a) {
var k = "";
var b = 406644;
var b1 = 3293161072;
var jd = ".";
var $b = "+-a^+6";
var Zb = "+-3^+b+-f";
for (var e = [], f = 0, g = 0; g < a.length; g++) {
var m = a.charCodeAt(g);
128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : (55296 == (m & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023),
e[f++] = m >> 18 | 240,
e[f++] = m >> 12 & 63 | 128) : e[f++] = m >> 12 | 224,
e[f++] = m >> 6 & 63 | 128),
e[f++] = m & 63 | 128)
}
a = b;
for (f = 0; f < e.length; f++) a += e[f],
a = RL(a, $b);
a = RL(a, Zb);
a ^= b1 || 0;
0 > a && (a = (a & 2147483647) + 2147483648);
a %= 1E6;
return a.toString() + jd + (a ^ b)
};
function RL(a, b) {
var t = "a";
var Yb = "+";
for (var c = 0; c < b.length - 2; c += 3) {
var d = b.charAt(c + 2),
d = d >= t ? d.charCodeAt(0) - 87 : Number(d),
d = b.charAt(c + 1) == Yb ? a >>> d: a << d;
a = b.charAt(c) == Yb ? a + d & 4294967295 : a ^ d
}
return a
}
""")
        return ctx

    def buildUrl(self, text, tk, sl, tl):
        """Build the full request URL for translating *text* into *tl*.

        Args:
            text: Source text; it is percent-encoded into the ``q`` parameter.
            tk: Token produced by :meth:`getTk` for the same text.
            sl: Source language code. Accepted for interface compatibility
                only: the request always sends ``sl=auto`` so the source
                language is detected by the server.
            tl: Target language code.

        Returns:
            The request URL as a string.
        """
        params = [
            # The original author notes that client=webapp gives better
            # translations than client=t.
            ('client', 'webapp'),
            ('sl', 'auto'),
            ('tl', str(tl)),
            ('hl', 'zh-CN'),
            ('dt', 'at'),
            ('dt', 'bd'),
            ('dt', 'ex'),
            ('dt', 'ld'),
            ('dt', 'md'),
            ('dt', 'qca'),
            ('dt', 'rw'),
            ('dt', 'rm'),
            ('dt', 'ss'),
            ('dt', 't'),
            ('ie', 'UTF-8'),
            ('oe', 'UTF-8'),
            ('clearbtn', '1'),
            ('otf', '1'),
            ('pc', '1'),
            ('srcrom', '0'),
            ('ssel', '0'),
            ('tsel', '0'),
            ('kc', '2'),
            ('tk', str(tk)),
        ]
        query = '&'.join(name + '=' + value for name, value in params)
        return self.url + '?' + query + '&q=' + urllib.parse.quote(text)

    def getHtml(self, session, url, headers, timeout=10):
        """GET *url* and return the decoded JSON body, or ``None`` on failure.

        Args:
            session: Object with a ``requests.Session``-compatible ``get``.
            url: Full request URL.
            headers: HTTP headers to send.
            timeout: Seconds to wait before giving up, so that a stalled
                connection cannot block the caller forever.

        Returns:
            The parsed JSON payload, or ``None`` when the request fails, the
            server answers with an HTTP error status, or the body is not JSON.
        """
        try:
            response = session.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError):
            # Best effort by design: the caller turns None into its failure marker.
            return None

    def translate(self, from_lan, to_lan, text):
        """Translate *text* and return the translation split into lines.

        Args:
            from_lan: Source language code, passed to :meth:`buildUrl` as
                ``sl`` (which currently always requests auto-detection).
            to_lan: Target language code.
            text: Text to translate.

        Returns:
            A list with one entry per line of the translated text. When the
            request fails or the payload has an unexpected shape, the list
            ``['谷歌翻译失败']`` is returned and the same message is printed.
        """
        tk = self.getTk(text)
        url = self.buildUrl(text, tk, from_lan, to_lan)
        result = self.getHtml(self.session, url, self.headers)
        if not isinstance(result, list) or not result:
            print('谷歌翻译失败')
            return ['谷歌翻译失败']
        # result[0] holds the translated segments; treat a null entry as
        # "no segments" instead of crashing while iterating over None.
        segments = result[0] or []
        pieces = [seg[0] for seg in segments if seg and seg[0] is not None]
        return ''.join(pieces).split('\n')
def Use(text, to_lan='th', from_lan='zh-CN'):
    """Translate *text* with a fresh ``Google`` client and return one string.

    Args:
        text: Text to translate.
        to_lan: Target language code; change it (or pass another value) to
            translate into a different language.
        from_lan: Source language code forwarded to ``Google.translate``.

    Returns:
        The translated text. ``Google.translate`` returns a list of lines;
        they are joined with newlines so that multi-line input is not cut
        down to its first line.
    """
    gg = Google()
    return '\n'.join(gg.translate(from_lan, to_lan, text))
if __name__ == '__main__':
    # Manual check: read one piece of text from stdin and print its translation.
    text = input("请输入原文:\n")
    translated = Use(text)
    print(translated)
Demo.py
import GoogleTranslation
import json
import time
import demjson
def translation(dic_json, translate=None, delay=1.9):
    """Translate every string value of a decoded JSON structure in place.

    Dictionaries and lists are walked recursively at any nesting depth.
    Dictionary keys are never touched, and values that are not strings
    (numbers, booleans, ``None``) are left as they are.

    Args:
        dic_json: A ``dict`` or ``list`` produced by a JSON decoder. Any
            other type is ignored.
        translate: Callable that maps a source string to its translation.
            Defaults to ``GoogleTranslation.Use``.
        delay: Seconds to sleep after each translated string, to lower the
            chance of the scraper being detected.

    Returns:
        None. ``dic_json`` is modified in place.
    """
    if translate is None:
        translate = GoogleTranslation.Use

    # Dicts and lists are both addressed as container[slot], so one loop serves both.
    if isinstance(dic_json, dict):
        slots = list(dic_json)
    elif isinstance(dic_json, list):
        slots = range(len(dic_json))
    else:
        return

    for slot in slots:
        value = dic_json[slot]
        if isinstance(value, (dict, list)):
            translation(value, translate, delay)
        elif isinstance(value, str) and value.strip():
            # Value before translation.
            print(value)
            dic_json[slot] = translate(value)
            # Value after translation.
            print(dic_json[slot])
            time.sleep(delay)
if __name__ == '__main__':
    # Read the raw JSON text; the context manager closes the file even if
    # read() raises.
    with open("file.txt", encoding="utf-8") as file:
        str1 = file.read()
    # Decode with demjson rather than json.loads so that irregular JSON text
    # can still be turned into a Python object.
    data_json = demjson.decode(str1, encoding="utf-8")
    print(data_json)
    # Translate the values in place.
    translation(data_json)
    # Write the translated structure back out as JSON text.
    with open("file1.txt", "w", encoding="utf-8") as fp:
        fp.write(json.dumps(data_json, ensure_ascii=False, indent=4))