功能描述
虽然各翻译网站都提供了各自的API,但是要么付费,要么有字符数量限制。我也尝试过用post来获取,但也不是抓包那么简单的,还需要一番计算。本例子用的python3.7+selenium+火狐,用chrome会有些问题,最好别用。本例的优势就是开发快,缺点就是——慢。但你们后续可以用多线程来弥补。
本例只翻译第一个工作表(sheet),结果另存为"b.xls",不保留原格式;之后可以通过复制、选择性粘贴的方式填回原文件。
文件夹列表
- pytest_translater
  - firefox_profile
  - index
我的working directory是pytest_translater,请根据你自己的项目设计
代码1 <translator_selenium.py>
分别用了有道和bing的网页版翻译。bing的结果元素文本我不会抓取,所以直接用了它的复制按钮(再从剪贴板取回译文)。参考了很多帖子,就不一一列举了,请自行搜索selenium教程。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
import win32clipboard
import win32con
from bs4 import BeautifulSoup
import time
def get_clip():
    """Return the text currently held on the Windows clipboard.

    The clipboard is read in ``CF_UNICODETEXT`` format through
    ``win32clipboard``.

    Returns:
        Whatever ``GetClipboardData`` returns for ``CF_UNICODETEXT``.

    Raises:
        Any error raised by ``GetClipboardData`` propagates to the caller;
        the clipboard is still closed in that case.
    """
    win32clipboard.OpenClipboard()
    try:
        return win32clipboard.GetClipboardData(win32con.CF_UNICODETEXT)
    finally:
        # Always release the clipboard: leaving it open after a failed read
        # would keep it locked for the retry loop and for other programs.
        win32clipboard.CloseClipboard()
class translator_youdao:
    """Translate text by driving the Youdao web translator in Firefox.

    The most recent input/output pair is cached on the instance, so asking
    for the same text twice in a row does not touch the browser.
    """

    def __init__(self):
        """Open Firefox with the configured profile and load the Youdao page."""
        self.fireprofile_path = r"firefox_profile"  # set this to the real profile path
        self.browser = webdriver.Firefox(firefox_profile=self.fireprofile_path)
        self.browser.get('http://fanyi.youdao.com/')
        # XPaths of the input box and of the translated text; both may change
        # whenever the site is redesigned.
        self.input_element = '//*[@id="inputOriginal"]'
        self.output_element = '/html/body/div[2]/div[2]/div[2]/div[1]/div[3]/p/span'
        self.input = ''
        self.output = ''

    @staticmethod
    def _combine(soup):
        """Join the translated sentences found in *soup*, one line per section.

        A "section" is a tag carrying ``data-section`` but not
        ``data-sentence``; a "sentence" is a tag carrying ``data-sentence``.
        """
        def is_sentence(tag):
            return tag.has_attr('data-sentence')

        def is_section(tag):
            return tag.has_attr('data-section') and not tag.has_attr('data-sentence')

        return '\n'.join(
            ''.join(sentence.text for sentence in section.find_all(is_sentence))
            for section in soup.find_all(is_section)
        )

    def translate(self, text='hello world!', timeout=10.0):
        """Translate *text* and return the text scraped from the result pane.

        Args:
            text: Text to translate. An empty string returns ``''`` without
                using the browser; repeating the previous text returns the
                cached result.
            timeout: Upper bound, in seconds, on how long to wait for the
                page. It limits both the wait for the output element and the
                wait for the displayed translation to change.

        Returns:
            The combined translation text.

        Raises:
            selenium.common.exceptions.TimeoutException: if the output
                element does not appear within *timeout* seconds.
        """
        # Same request as last time: reuse the cached answer.
        if text == self.input:
            return self.output
        if text == '':
            return ''
        self.input = text

        # Type the text into the input box. The element is looked up again
        # for each action, as the page may re-render between them.
        self.browser.find_element(By.XPATH, self.input_element).clear()
        self.browser.find_element(By.XPATH, self.input_element).send_keys(self.input)

        deadline = time.monotonic() + timeout
        # On the first run the output box only appears after a translation
        # has been requested; a single bounded wait replaces the former
        # endless retry loop that swallowed every exception.
        WebDriverWait(self.browser, timeout).until(
            EC.presence_of_element_located((By.XPATH, self.output_element)))

        previous = self.output
        while True:
            self.soup = BeautifulSoup(self.browser.page_source, features="html5lib")
            current = self._combine(self.soup)
            # A changed text means the page has refreshed. If the deadline
            # passes without a change, accept the text as it stands: two
            # different inputs can legitimately share a translation, which
            # used to make this loop spin forever.
            if current != previous or time.monotonic() >= deadline:
                self.output = current
                break
            # Give the page a moment to render the result.
            time.sleep(0.1)
        return self.output
class translator_bing:
    """Translate English to Simplified Chinese via the Bing web translator.

    The translated text is obtained by clicking the page's copy button and
    reading the clipboard with ``get_clip``. The most recent input/result
    pair is cached on the instance.
    """

    def __init__(self):
        """Open Firefox, load Bing Translator and select the language pair."""
        self.fireprofile_path = r"firefox_profile"  # set this to the real profile path
        self.browser = webdriver.Firefox(firefox_profile=self.fireprofile_path)
        self.browser.get('http://cn.bing.com/translator/')
        # Choose source and target languages.
        language_selector_sr = Select(self.browser.find_element(By.XPATH, '//*[@id="tta_srcsl"]'))
        language_selector_sr.select_by_value('en')
        language_selector_tg = Select(self.browser.find_element(By.XPATH, '//*[@id="tta_tgtsl"]'))
        language_selector_tg.select_by_value('zh-Hans')
        self.result = ''
        # XPaths of the input and output boxes; both may change whenever the
        # site is redesigned.
        self.input_element = '//*[@id="tta_input"]'
        self.output_element = '//*[@id="tta_output"]'
        self.input = ''
        self.output = ''

    def _input_box(self):
        """Look up the input box afresh (the page may re-render between actions)."""
        return self.browser.find_element(By.XPATH, self.input_element)

    def translate(self, text='hello world!', timeout=10.0):
        """Translate *text* and return the text copied from the result pane.

        Args:
            text: Text to translate. An empty string returns ``''`` without
                using the browser; repeating the previous text returns the
                cached result.
            timeout: Upper bound, in seconds, on retrying after errors and on
                waiting for the copied text to differ from the previous
                translation.

        Returns:
            The copied translation as ``str``.

        Raises:
            Exception: the last error from the browser or the clipboard, if
                attempts are still failing once *timeout* has elapsed.
        """
        if text == self.input:
            return self.result
        if text == '':
            return ''
        self.input = text

        # Bing is flaky: sometimes the input box does not take focus, so it
        # is clicked and cleared first; sometimes the output keeps waiting
        # after typing, so ENTER is sent. Keep or drop these steps depending
        # on what debugging shows.
        self._input_box().click()
        self._input_box().send_keys(Keys.BACKSPACE)
        self._input_box().clear()
        self._input_box().send_keys(self.input)
        self._input_box().send_keys(Keys.ENTER)

        copy_icon = '//*[@id="tta_copyIcon"]'
        previous = self.output
        deadline = time.monotonic() + timeout
        while True:
            try:
                # Wait for the output box and the copy button, then copy.
                WebDriverWait(self.browser, 5).until(
                    EC.presence_of_element_located((By.XPATH, self.output_element)))
                WebDriverWait(self.browser, 5).until(
                    EC.presence_of_element_located((By.XPATH, copy_icon)))
                self.browser.find_element(By.XPATH, copy_icon).click()
                # Read the clipboard once so the value compared below is the
                # value that gets stored.
                copied = get_clip()
            except Exception:
                # Deliberately broad: browser hiccups and clipboard errors
                # are both retried. The retry is bounded, and the last error
                # is re-raised instead of being swallowed forever.
                if time.monotonic() >= deadline:
                    raise
                continue
            # The copied text used to be compared with the output XPath
            # string, which never matched. Comparing with the previous
            # translation detects a pane that has not refreshed yet. At the
            # deadline the text is accepted anyway, because two inputs can
            # share a translation.
            if copied != previous or time.monotonic() >= deadline:
                break
            time.sleep(0.1)

        self.output = copied
        self.result = str(self.output)
        return self.result
if __name__ == '__main__':
    # Manual smoke test: translate a short word and a two-line sentence with
    # the Youdao translator and print each result.
    demo = translator_youdao()
    samples = (
        'thanks',
        'Check that color code is correct.Check that surface quality is even, \nno scrathes or rough surface',
    )
    for sample in samples:
        print(demo.translate(sample))
代码2 <excel_rw.py>
主程序代码比较简单,参考了https://blog.csdn.net/csdnnews/article/details/80878945
import re
import xlrd
import xlwt
import datetime
import index.translator_selenium as translator_selenium
def excel_rd(file='a.xlsx'):
    """Read every cell of the first worksheet of *file*.

    Args:
        file: Path of the workbook to open with ``xlrd``.

    Returns:
        A tuple ``(input_data, output_data)``: ``input_data`` is a
        row-major list of lists holding each cell's value, and
        ``output_data`` is a grid of the same shape filled with empty
        strings, ready to receive results.
    """
    workbook = xlrd.open_workbook(filename=file)
    sheet = workbook.sheet_by_index(0)
    n_rows, n_cols = sheet.nrows, sheet.ncols

    # Blank grid with the same shape as the sheet.
    blank_grid = [[''] * n_cols for _ in range(n_rows)]
    print('表格读取完毕,共%d行,%d列' %(n_rows, n_cols))

    cell_values = [
        [sheet.cell(r, c).value for c in range(n_cols)]
        for r in range(n_rows)
    ]
    return cell_values, blank_grid
def excel_wt(data, file='b.xls'):
    """Write the 2-D list *data* to a new workbook saved as *file*.

    Every cell is written with text wrapping enabled into a single sheet
    named ``sheet1_Chs``.

    Args:
        data: Row-major list of lists of cell values.
        file: Destination path passed to ``xlwt``'s ``Workbook.save``.
    """
    # Cell style: only automatic line wrapping is turned on.
    wrap_style = xlwt.XFStyle()
    wrap_style.alignment.wrap = 1

    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('sheet1_Chs', cell_overwrite_ok=True)
    for row_idx, row in enumerate(data):
        for col_idx, value in enumerate(row):
            sheet.write(row_idx, col_idx, value, wrap_style)
    workbook.save(file)
if __name__ == '__main__':
    # Translate every non-empty, non-numeric cell of the input workbook and
    # save the result, reporting the elapsed time and the number of cells
    # that were translated.
    t_start = datetime.datetime.now()
    # Number of cells sent to the translator.
    k = 0
    # Load the sheet.
    input_data, output_data = excel_rd()
    # Start the translator (this opens a browser).
    translator = translator_selenium.translator_bing()
    # Compiled once instead of once per cell.
    # NOTE(review): this pattern removes EVERY space, including those inside
    # the original English text, although the intent was only to drop
    # superfluous spaces -- confirm the intended pattern.
    strinfo = re.compile(' ')
    try:
        for i, row in enumerate(input_data):
            for j, cell in enumerate(row):
                if cell == '':
                    output_data[i][j] = ''
                    continue
                print(cell)
                # Optional extra: leave item numbers untranslated. Anything
                # that parses as a float is copied through unchanged; delete
                # this check if it is not wanted.
                try:
                    float(cell)
                except (TypeError, ValueError):
                    pass
                else:
                    output_data[i][j] = cell
                    continue
                # To store only the translation, use instead:
                #     output_data[i][j] = translator.translate(cell)
                # Here the cell shows the original text followed by the
                # translation on a new line.
                output_data[i][j] = strinfo.sub('', cell + '\n' + translator.translate(cell))
                print(output_data[i][j])
                k += 1
    finally:
        # Close the browser even when a translation fails, so that no
        # Firefox/driver process is left running.
        translator.browser.quit()
    excel_wt(output_data)
    t_consume = datetime.datetime.now() - t_start
    print('共耗时%s,翻译了%d条' % (t_consume, k))
    print('completed')
效果
翻译过程示例:
Visual inspection
Visual inspection
目视检查
Check that coatings of the parts are as specified in mechanical drawings and coating quality is equal
Check that coatings of the parts are as specified in mechanical drawings and coating quality is equal
检查零件的涂层是否符合机械图纸规定,涂层质量是否一致
0.5
Ball screw and conveyor assembly to welded frame
Ball screw and conveyor assembly to welded frame
滚珠丝杠和输送装置组装成焊接框架
Instructions
Instructions
指令
翻译速度大约1.5秒1条吧,主要瓶颈是网速和电脑性能。下面之所以接近2秒是因为垃圾电脑打开浏览器太慢了,耗费20多秒。
共耗时0:01:48.633000,翻译了57条
completed
截图
就是这样。这是我的第一篇博客,比较粗糙:代码不美观,结构不科学,效率也不高,唯一的优点是开发时间短。毕竟我不是专业程序员,基本就是复制粘贴而已。请多多鼓励,后续我会继续写ppt和word的整篇翻译。
话说谁能告诉我为啥文章目录不显示代码2?