上代码:
# coding=utf-8
import urllib,urllib.request
from fake_useragent import UserAgent
import json
import time
import hashlib
import urllib.parse
import requests
import random
import csv,re
class YouDaoFanyi:
    """Small client for the Youdao translation HTTP API.

    Each call to ``translate`` signs the query with the application
    credentials, sends it as a GET request and returns the first translation
    found in the JSON reply.
    """

    def __init__(self, appKey, appSecret):
        """Store the credentials and the default request settings.

        :param appKey: application id
        :param appSecret: application secret used to sign every request
        """
        self.url = 'https://openapi.youdao.com/api/'
        # A random User-Agent is picked once, when the client is created.
        self.headers = {'User-Agent': str(UserAgent().random)}
        self.appKey = appKey  # application id
        self.appSecret = appSecret  # application secret
        self.langFrom = 'EN'  # source language; 'auto' means auto-detect
        self.langTo = 'zh-CHS'  # target language; 'auto' means auto-detect

    def getUrlEncodedData(self, queryText):
        """Build the signed, URL-encoded query string for one request.

        The signature is the hex MD5 digest of
        ``appKey + queryText + salt + appSecret`` encoded as UTF-8.

        :param queryText: text to translate
        :return: URL-encoded query string (without a leading ``?``)
        """
        # The original author notes that a fixed value works here in place of
        # a freshly generated random number.
        salt = '2'
        sign_str = (self.appKey + queryText + salt + self.appSecret).encode('utf-8')
        sign = hashlib.md5(sign_str).hexdigest()
        payload = {
            'q': queryText,
            'from': self.langFrom,
            'to': self.langTo,
            'appKey': self.appKey,
            'salt': salt,
            'sign': sign,
        }
        # The request is sent with GET, so the payload travels in the URL.
        return urllib.parse.urlencode(payload)

    def parseHtml(self, html):
        """Extract the translation from the JSON text returned by the API.

        :param html: response body (JSON text)
        :return: the translation (first element when the API returns a
            list), or ``''`` when the reply carries no usable translation,
            e.g. an error reply that only contains ``errorCode``
        :raises ValueError: if ``html`` is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass)
        """
        data = json.loads(html)
        print('-------------------------')
        translationResult = data.get('translation') if isinstance(data, dict) else None
        if isinstance(translationResult, list):
            # An empty list would otherwise raise IndexError on [0].
            translationResult = translationResult[0] if translationResult else None
        if not translationResult:
            # Error replies have no 'translation' key; report the code and
            # hand back an empty string so batch callers can carry on.
            errorCode = data.get('errorCode') if isinstance(data, dict) else None
            print('翻译失败,errorCode:' + str(errorCode))
            return ''
        print(translationResult)
        return translationResult

    def translate(self, queryText, timeout=10):
        """Translate ``queryText`` and return the translated text.

        The proxy list is re-read through the module-level ``get_ip_list`` /
        ``get_random_ip`` helpers on every call.

        :param queryText: text to translate
        :param timeout: seconds passed to ``requests.get`` as its timeout;
            without one, a stalled connection would block forever
        :return: whatever ``parseHtml`` returns for the response body
        :raises requests.exceptions.RequestException: on network failure or
            when the timeout expires
        """
        data = self.getUrlEncodedData(queryText)  # signed query string
        target_url = self.url + '?' + data  # final request URL
        ip_list = get_ip_list()
        proxies = get_random_ip(ip_list)
        print('随机ip为:' + str(proxies))
        req = requests.get(target_url, proxies=proxies, headers=self.headers,
                           timeout=timeout)
        req.encoding = 'utf-8'
        # Parse the reply and show/return the translation.
        return self.parseHtml(req.text)
#功能:读取文件并处理
def read_file(filepath):
    """Read a UTF-8, ``|``-delimited CSV file and return its non-empty rows.

    :param filepath: path of the CSV file to read
    :return: list of rows, each a list of strings; empty rows are skipped
    """
    # newline='' lets the csv module do its own line-ending handling, so
    # line breaks embedded in quoted fields are preserved unchanged.
    with open(filepath, 'r', encoding='utf-8', newline='') as csvfile:
        rows = csv.reader(csvfile, delimiter='|', quoting=csv.QUOTE_MINIMAL)
        return [row for row in rows if row]
#功能:将爬取到的内容写入文件
#注意事项:写文件时open中要加上newline='',否则写一行后程序会自动换行
def write_file(filepath, row):
    """Append ``row`` to ``filepath`` as one ``|``-delimited CSV record.

    The file is opened in append mode with UTF-8 encoding and
    ``newline=''``; without the latter an extra blank line would follow
    every written row.

    :param filepath: path of the CSV file to append to
    :param row: sequence of field values forming one record
    """
    with open(filepath, 'a+', encoding='utf-8', newline='') as out_file:
        csv.writer(
            out_file,
            delimiter='|',
            quoting=csv.QUOTE_MINIMAL,
        ).writerow(row)
# Load the proxy IP list from IP.txt (the entries are not validated here)
def get_ip_list():
    """Return the non-blank lines of ``IP.txt`` from the working directory.

    Lines are returned as read (trailing newline included); blank or
    whitespace-only lines are dropped so they can never be picked as a
    proxy. No other validation is performed.

    :return: list of raw lines from ``IP.txt``
    :raises OSError: if ``IP.txt`` cannot be opened
    """
    # The context manager closes the file even if reading fails.
    with open('IP.txt', 'r') as f:
        return [line for line in f if line.strip()]
#从IP列表中获取随机IP
def get_random_ip(ip_list):
    """Pick a random entry of ``ip_list`` and build a Requests proxy mapping.

    :param ip_list: proxy entries, e.g. raw lines read from a file
    :return: ``{'http': ip, 'https': ip}`` for the chosen entry with
        surrounding whitespace removed, or ``{}`` (no proxy) when
        ``ip_list`` is empty
    """
    if not ip_list:
        # random.choice raises IndexError on an empty sequence; an empty
        # mapping makes Requests connect without a proxy instead.
        return {}
    proxy_ip = random.choice(ip_list).strip()
    # Requests selects the proxy by the scheme of the requested URL, so an
    # 'http'-only mapping is ignored for https:// URLs.
    return {'http': proxy_ip, 'https': proxy_ip}
# Script entry point: translate the first column of every input row and
# append the (possibly updated) row to the output CSV.
if __name__ == "__main__":
    import os  # local import: only the script entry point needs it

    print('程序开始运行!')
    appKey = '应用id'  # application id
    appSecret = '应用密钥'  # application secret
    fanyi = YouDaoFanyi(appKey, appSecret)

    output_path = '经有道翻译处理后的文件/E_baiduBaike_youdaoChinese.csv'
    # Appending to a file does not create missing directories, so make sure
    # the output directory exists before the first row is written.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Compiled once instead of on every iteration; matches any run of
    # characters in the range U+4E00..U+9FA5.
    zhPattern = re.compile(u'[\u4e00-\u9fa5]+')

    reader = read_file('E_baiduBaike_notHaveChinese.csv')
    for row in reader:
        print('现在翻译的人名是:' + row[0])
        translationResult = fanyi.translate(row[0])
        print('翻译结果为:' + str(translationResult))
        # Only keep the translation when it actually contains such characters.
        if isinstance(translationResult, str) and zhPattern.search(translationResult):
            if len(row) < 7:
                # Pad short rows so that column index 6 exists.
                row.extend([''] * (7 - len(row)))
            row[6] = translationResult
        # NOTE(review): the original indentation was lost; every row is
        # written here, translated or not - confirm this is the intent.
        write_file(output_path, row)
    print('爬取完成')