Scraping IEEE abstracts and translating them into Chinese

# -*- coding: utf-8 -*-
"""
Created on Thu Oct 18 09:13:32 2018

@author: Gawen

Scrapes the abstracts of all papers listed on a target IEEE Xplore issue page,
translates them with the Baidu Translate API, and writes each link together
with its translated abstract to a text file.
You must apply for your own Baidu API appid and secret key;
the free quota is 2 million characters per month.
Python version: 3.6
Required package: selenium (the script also uses requests, bs4 and lxml)
Required software: ChromeDriver
"""
# example url: https://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=8360187&punumber=8360187&filter=issueId%20EQ%20%228363090%22&pageNumber=9&pageNumber=10
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium.webdriver.chrome.options import Options
import hashlib
import urllib.parse  # imported explicitly so urllib.parse.quote is always available
import random
import json

def writetxt(file, url, abstract):
    # Append the paper link and its translated abstract to the output text file.
    with open(file, 'a', encoding='GBK') as file_txt:
        file_txt.write('链接:\n' + url)
        file_txt.write('\n')
        file_txt.write('摘要:\n' + abstract)
        file_txt.write('\n')
        file_txt.write('\n')

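A minimal call, with a hypothetical output path and placeholder text, looks like the line below. Note that the file is opened with GBK encoding; if a translated abstract ever contains a character outside GBK, the write raises UnicodeEncodeError, and switching to encoding='utf-8' is the usual fix.

writetxt(r'C:\temp\abstract.txt', 'https://ieeexplore.ieee.org/document/0000000', '翻译后的摘要文本')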

def trans(q):
    # Translate an English string into Chinese via the Baidu Translate API.
    appid = ''      # your own Baidu Translate appid
    secretkey = ''  # your own Baidu Translate secret key
    myurl = '/api/trans/vip/translate'
    fromLang = 'en'
    toLang = 'zh'
    salt = random.randint(32768, 65536)
    # The request signature is MD5(appid + query + salt + secret key).
    sign = appid + q + str(salt) + secretkey
    sign = hashlib.md5(sign.encode(encoding='utf-8')).hexdigest()
    myurl = myurl + '?appid=' + appid + '&q=' + urllib.parse.quote(q) + '&from=' + fromLang + '&to=' + toLang + '&salt=' + str(salt) + '&sign=' + sign
    print(myurl)
    try:
        r = requests.get('http://api.fanyi.baidu.com' + myurl)
        print(r.content.decode('utf-8'))
    except Exception as e:
        # Without this return, a failed request would leave r undefined below.
        print(e)
        return 'error'
    text_dict = json.loads(r.content.decode('utf-8'))  # the response body is plain JSON
    if 'error_code' in text_dict:
        return 'error'
    return text_dict['trans_result'][0]['dst']

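For reference, the same signing scheme (sign = MD5(appid + q + salt + secretkey)) can be written more compactly by letting requests build and encode the query string, which makes the manual urllib.parse.quote call unnecessary. This is only a sketch of the call made in trans() above, not part of the original script; the name trans_v2 and the explicit appid/secretkey parameters are illustrative.

def trans_v2(q, appid, secretkey, from_lang='en', to_lang='zh'):
    # Sign the request: MD5 over appid + query text + salt + secret key.
    salt = str(random.randint(32768, 65536))
    sign = hashlib.md5((appid + q + salt + secretkey).encode('utf-8')).hexdigest()
    params = {'q': q, 'from': from_lang, 'to': to_lang,
              'appid': appid, 'salt': salt, 'sign': sign}
    try:
        resp = requests.get('http://api.fanyi.baidu.com/api/trans/vip/translate',
                            params=params, timeout=10)
        result = resp.json()
    except Exception as e:
        print(e)
        return 'error'
    if 'error_code' in result:
        return 'error'
    return result['trans_result'][0]['dst']
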
url = input('please input the url that you want to download:\n')
fore = 'https://ieeexplore.ieee.org'
r = requests.get(url)
html = r.content.decode('utf-8')
soup = BeautifulSoup(html, 'lxml')
# Collect the paper titles listed on the issue page.
h3 = soup.find('div', class_='cf jrnl-results-filter').find_all('h3')
h3text = []
errtitle = []
links = []
for h in h3:
    h3text.append(h.text.strip())
print(h3text)
# Look up each title's "View HTML" link; papers without one are recorded as failures.
for i in range(len(h3text)):
    if soup.find('a', attrs={'aria-label': 'View HTML:  ' + h3text[i]}) is None:
        errtitle.append(h3text[i])
        continue
    href = soup.find('a', attrs={'aria-label': 'View HTML:  ' + h3text[i]})['href']
    links.append(fore + href)
print(links)
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(chrome_options=chrome_options)
count = 0
for link in links:
    driver.get(link)
    driver.implicitly_wait(20)  # only affects element lookups; see the explicit-wait sketch after the script
    ps = driver.page_source
    lsoup = BeautifulSoup(ps, 'lxml')
    abstract = lsoup.select('body > div > div > div > div > div > div > xpl-root > xpl-document-details > div > div > div > div > section > div > div > xpl-document-abstract > section > div > div > div > div > div')[0].text
    abstract = trans(abstract)
    if abstract == 'error':
        errtitle.append(link)
        continue
    writetxt(r'C:\Users\Gawen\Desktop\abstract.txt', link, abstract)  # output path
    count += 1
    print(count)
    time.sleep(5)
driver.close()
print("共有" + str(len(errtitle)) + "篇论文下载失败")
for err in errtitle:
    print(err)
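One caveat in the loop above: implicitly_wait() only applies to find_element lookups, so reading page_source right after driver.get() can still capture the page before the Angular front end has rendered the abstract, and the very long CSS selector is brittle. A more robust variant waits explicitly for the abstract component; this is a sketch that assumes the abstract is rendered inside an xpl-document-abstract element, and it is not code from the original post.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_abstract(driver, link, timeout=20):
    # Load the article page and block until the abstract component is present.
    driver.get(link)
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'xpl-document-abstract')))
    node = BeautifulSoup(driver.page_source, 'lxml').find('xpl-document-abstract')
    return node.get_text(strip=True) if node else ''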
Reposted from: https://www.cnblogs.com/gawen4/p/9870330.html
