import PyPDF2
import requests
import json
from hashlib import md5
import random
import re
import time
# Translation direction: source language -> target language.
ori_lan = "en"
to_lan = "zh"

# Translation endpoint, assembled from the host and the request path.
ori_url = "http://api.fanyi.baidu.com"
path = "/api/trans/vip/translate"
url = f"{ori_url}{path}"

# Replace with the application ID you obtained from the Baidu Translate open platform.
appid = "your_appid"
# Replace with the secret key you obtained from the Baidu Translate open platform.
appkey = "your_keyid"
def get_pdf_text(path):
    """Read a PDF and return its title together with the text of every page.

    Args:
        path: Location of the PDF file; handed straight to ``PyPDF2.PdfReader``.

    Returns:
        A two-item list ``[title, text_page]``. ``title`` is the title taken
        from the document metadata, or ``None`` when the PDF has no metadata.
        ``text_page`` holds one extracted-text string per page, in page order.
    """
    reader = PyPDF2.PdfReader(path)
    # ``metadata`` is None for a PDF without a document information
    # dictionary, so guard before reading the title.
    metadata = reader.metadata
    title = metadata.title if metadata is not None else None
    # Fall back to "" so every entry can be treated as a string by callers.
    text_page = [page.extract_text() or "" for page in reader.pages]
    return [title, text_page]
def make_sign_by_md5(s: str, encoding: str = 'utf-8') -> str:
    """Return the MD5 digest of *s* as a hexadecimal string.

    Args:
        s: Text to hash.
        encoding: Codec used to turn *s* into bytes before hashing.

    Returns:
        The hexadecimal representation of the MD5 digest.
    """
    return md5(s.encode(encoding)).hexdigest()
def translate(text: str, timeout: float = 10.0) -> str:
    """Translate *text* through the configured translation endpoint.

    Languages are taken from the module-level ``ori_lan`` / ``to_lan`` and
    credentials from ``appid`` / ``appkey``. The text being sent and the raw
    response body are printed for inspection.

    Args:
        text: Text to translate. Surrounding whitespace is removed first.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The translated text, or ``''`` when *text* is empty or whitespace
        only. If the response contains several translated segments they are
        joined with newlines.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        RuntimeError: If the response contains no ``trans_result`` entries
            (for example when the service answers with an error payload).
    """
    # Strip every kind of whitespace so a blank page never reaches the API.
    text = text.strip()
    if not text:
        return ''
    print(text)
    salt = random.randint(32768, 65536)
    # Request signature: MD5 of appid + query + salt + secret key.
    sign = make_sign_by_md5(appid + text + str(salt) + appkey)
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    payload = {
        'appid': appid,
        'q': text,
        'from': ori_lan,
        'to': to_lan,
        'salt': salt,
        'sign': sign,
    }
    # Send the fields as a form body, matching the declared Content-Type, so
    # long page text is not squeezed into the URL query string.
    r = requests.post(url, data=payload, headers=headers, timeout=timeout)
    print(r.text)
    r.raise_for_status()
    result = r.json()
    segments = result.get('trans_result')
    if not segments:
        raise RuntimeError(f'Translation failed, response was: {result!r}')
    return '\n'.join(segment['dst'] for segment in segments)
def remove_special_symbols(text):
    """Strip ``<img ...>`` tags and punctuation/symbol characters from *text*.

    Args:
        text: The string to clean.

    Returns:
        *text* with every ``<img ...>`` tag removed and every character that
        is neither a word character nor whitespace removed.
    """
    # Remove the tags first: the punctuation pass below deletes '<' and '>',
    # after which the tag pattern could never match.
    text = re.sub(r'<img.*?>', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text
# Extract the text of every page of test.pdf, drop the line breaks inside
# each page, then strip tags and punctuation.
text_set = [page_text.replace("\n", '') for page_text in get_pdf_text('test.pdf')[1]]
text_set = [remove_special_symbols(page_text) for page_text in text_set]
print(text_set)

# Translate page by page, printing the page index as a progress indicator.
trans_text = []
for i, text in enumerate(text_set):
    print(i)
    trans_text.append(translate(text))
    # Pause between requests; presumably to stay under the service's
    # request-rate limit (TODO confirm the allowed rate).
    time.sleep(1)

# Write one translated page per line. The encoding is explicit so the
# translated text is stored the same way on every platform; the ``with``
# block closes the file, so no explicit close() is needed.
with open('test.txt', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in trans_text)
# A PDF translation script implemented in Python.
# Source page note: latest recommended article published on 2024-05-30 11:43:13.