笔者环境:win10 + python3.9 + requests 2.25.1 + urllib3 1.25.8 + selenium 3.141.0 + chromedriver.exe 90.0.4430.212
目前通过 python 调用免费的谷歌机器翻译,据我测试有两种相对简单的可行方法:
- 调用免费翻译接口,优点是速度快,缺点是次数有限制并且需要挂 VPN,目前为止测试是单个 IP 查个几十次就会被限制
- selenium 大法:模拟浏览器请求并抓取结果。优点是不需要挂 VPN;缺点是速度稍慢,而且单次查询有 5000 字符的限制,所以我在代码里做了自动截断。这里留了个坑后面再填:截断应优化为在临近 5000 字符处的上一个句子边界断开,以保证语义完整。另外需要注意谷歌浏览器和驱动的版本必须匹配
好了,下面直接上代码吧~:
# -*- coding:utf-8 -*-
import logging
from urllib.parse import urlencode

import requests
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
def translate(text, target_language="zh-CN", *, cookie=None, timeout=10):
    """
    Translate text through the free Google Translate HTTP endpoint.

    The endpoint throttles clients; the original author observed access
    being restricted after roughly 20 requests from a single IP.

    :param text: content to translate; anything that is not a non-blank
        ``str`` yields an empty result without any network traffic
    :param target_language: target language code (e.g. ``"zh-CN"``)
    :param cookie: optional ``cookie`` header value; omitted from the request
        when not supplied
    :param timeout: seconds to wait for the HTTP response before giving up
    :return: the translated text, or ``""`` when the input is blank or the
        request/parse fails (failures are logged, never raised)
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    # No explicit accept-encoding: requests advertises only the encodings it
    # can actually decode, whereas forcing "br" needs an optional brotli package.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
        "accept": "*/*",
        "accept-language": "zh-CN,zh;q=0.9",
    }
    # NOTE(review): the original always sent a non-ASCII placeholder cookie,
    # which presumably cannot be encoded as an HTTP header value; the header
    # is now sent only when the caller provides a real one.
    if cookie:
        headers["cookie"] = cookie

    url = "https://translate.google.com/translate_a/single"
    params = {
        "client": "gtx",
        "sl": "auto",
        "tl": target_language,
        "dt": "t",
        "ie": "UTF-8",
        "oe": "UTF-8",
        "q": text,
    }
    log = logging.getLogger(__name__)

    try:
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        log.warning("Google Translate request failed: %s", exc)
        return ""

    if response.status_code != 200:
        log.warning("Google Translate returned HTTP %s", response.status_code)
        return ""

    try:
        # The first element of the payload holds the translated segments;
        # each segment's first element is its translated text.
        segments = response.json()[0]
        return "".join(segment[0] for segment in segments if segment and segment[0])
    except (ValueError, IndexError, KeyError, TypeError) as exc:
        log.warning("Unexpected Google Translate response format: %s", exc)
        return ""
def translate_with_webdriver(text, target_language="zh-CN", *,
                             driver_path=r'D:\projects\Spider\chromedriver.exe',
                             timeout=10):
    """
    Translate text by driving the Google Translate web page in headless Chrome.

    Input longer than 5000 characters is cut into fixed-size 5000-character
    chunks that are translated one after another and concatenated.
    TODO: cut at the nearest sentence boundary before the limit instead of at
    a fixed offset, so sentences are not split mid-way.

    :param text: content to translate; anything that is not a non-blank
        ``str`` yields an empty result without starting a browser
    :param target_language: target language code (e.g. ``"zh-CN"``)
    :param driver_path: path to the chromedriver executable
    :param timeout: seconds to wait for the result element to appear
    :return: the translated text, or ``""`` when the input is blank or the
        browser automation fails (failures are logged, never raised)
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    max_chars = 5000
    if len(text) > max_chars:
        # Stepping by max_chars never produces an empty trailing chunk, even
        # when the length is an exact multiple of the limit.
        return "".join(
            translate_with_webdriver(
                text[start:start + max_chars],
                target_language,
                driver_path=driver_path,
                timeout=timeout,
            )
            for start in range(0, len(text), max_chars)
        )

    query = urlencode({"sl": "auto", "tl": target_language, "op": "translate", "text": text})
    result_xpath = (
        "//div[1]/div[2]/c-wiz[1]/div[2]/c-wiz[1]/div[1]/div[2]/div[2]/c-wiz[2]/div[5]/div[1]/div[1]/span[1]/span[1]"
    )

    options = webdriver.ChromeOptions()
    # Run without a visible browser window.
    options.add_argument('--headless')
    # Skip image loading to speed up the page.
    options.add_argument('blink-settings=imagesEnabled=false')

    log = logging.getLogger(__name__)
    driver = None
    try:
        driver = webdriver.Chrome(driver_path, options=options)
        # Element lookups poll for up to `timeout` seconds instead of failing
        # immediately when the element is not yet in the page.
        driver.implicitly_wait(timeout)
        driver.get("https://translate.google.cn/?" + query)
        container = driver.find_element_by_xpath(result_xpath)
        # Read the first nested <span> directly through Selenium; the original
        # used BeautifulSoup here without ever importing it.
        first_span = container.find_element_by_tag_name("span")
        return first_span.get_attribute("textContent") or ""
    except WebDriverException as exc:
        log.warning("Browser-based translation failed: %s", exc)
        return ""
    finally:
        # Always shut the browser down, even if a step above raised.
        if driver is not None:
            driver.quit()
# Lookup table of supported languages: English language name -> language
# code accepted as ``target_language`` by the translate functions.
support_lauguage = {
    "afrikaans": "af",
    "arabic": "ar",
    "belarusian": "be",
    "bulgarian": "bg",
    "catalan": "ca",
    "czech": "cs",
    "welsh": "cy",
    "danish": "da",
    "german": "de",
    "greek": "el",
    "english": "en",
    "esperanto": "eo",
    "spanish": "es",
    "estonian": "et",
    "persian": "fa",
    "finnish": "fi",
    "french": "fr",
    "irish": "ga",
    "galician": "gl",
    "hindi": "hi",
    "croatian": "hr",
    "hungarian": "hu",
    "indonesian": "id",
    "icelandic": "is",
    "italian": "it",
    "hebrew": "iw",
    "japanese": "ja",
    "korean": "ko",
    "latin": "la",
    "lithuanian": "lt",
    "latvian": "lv",
    "macedonian": "mk",
    "malay": "ms",
    "maltese": "mt",
    "dutch": "nl",
    "norwegian": "no",
    "polish": "pl",
    "portuguese": "pt",
    "romanian": "ro",
    "russian": "ru",
    "slovak": "sk",
    "slovenian": "sl",
    "albanian": "sq",
    "serbian": "sr",
    "swedish": "sv",
    "swahili": "sw",
    "thai": "th",
    "filipino": "tl",
    "turkish": "tr",
    "ukrainian": "uk",
    "vietnamese": "vi",
    "yiddish": "yi",
    "chinese_simplified": "zh-CN",
    "chinese_traditional": "zh-TW",
    "auto": "auto",
}
有问题欢迎评论区交流,持续更新 Q&A,如果觉得对你有帮助,请点赞让更多的人看到谢谢~
Q&A:
- 谷歌浏览器驱动的下载:
谷歌浏览器驱动各版本下载地址:https://chromedriver.chromium.org/downloads
——————————————————————————————
更多常用工具脚本、自动化脚本、封装方法积累,请移步:https://github.com/Joy917/ToolScripts