Scraping IEEE abstracts and translating them into Chinese

# -*- coding: utf-8 -*-
"""
Created on Thu Oct 18 09:13:32 2018

@author: Gawen

Scrapes the abstracts of all papers listed on a target IEEE Xplore issue page,
translates them with the Baidu Translate API, and writes each link together
with its translated abstract to a text file.
You must apply for your own Baidu API appid and secret key;
the free quota is 2 million characters per month.
Python version: 3.6
Required package: selenium (the script also uses requests, bs4 and lxml)
Required software: ChromeDriver
"""
# example url: https://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=8360187&punumber=8360187&filter=issueId%20EQ%20%228363090%22&pageNumber=9&pageNumber=10
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium.webdriver.chrome.options import Options
import hashlib
import urllib.parse  # imported explicitly so urllib.parse.quote is always available
import random
import json

def writetxt(file, url, abstract):
    # Append the paper link and its translated abstract to the output text file.
    with open(file, 'a', encoding='GBK') as file_txt:
        file_txt.write('链接:\n' + url)
        file_txt.write('\n')
        file_txt.write('摘要:\n' + abstract)
        file_txt.write('\n')
        file_txt.write('\n')

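A minimal call, with a hypothetical output path and placeholder text, looks like the line below. Note that the file is opened with GBK encoding; if a translated abstract ever contains a character outside GBK, the write raises UnicodeEncodeError, and switching to encoding='utf-8' is the usual fix.

writetxt(r'C:\temp\abstract.txt', 'https://ieeexplore.ieee.org/document/0000000', '翻译后的摘要文本')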

def trans(q):
    # Translate an English string into Chinese via the Baidu Translate API.
    appid = ''      # your own Baidu Translate appid
    secretkey = ''  # your own Baidu Translate secret key
    myurl = '/api/trans/vip/translate'
    fromLang = 'en'
    toLang = 'zh'
    salt = random.randint(32768, 65536)
    # The request signature is MD5(appid + query + salt + secret key).
    sign = appid + q + str(salt) + secretkey
    sign = hashlib.md5(sign.encode(encoding='utf-8')).hexdigest()
    myurl = myurl + '?appid=' + appid + '&q=' + urllib.parse.quote(q) + '&from=' + fromLang + '&to=' + toLang + '&salt=' + str(salt) + '&sign=' + sign
    print(myurl)
    try:
        r = requests.get('http://api.fanyi.baidu.com' + myurl)
        print(r.content.decode('utf-8'))
    except Exception as e:
        # Without this return, a failed request would leave r undefined below.
        print(e)
        return 'error'
    text_dict = json.loads(r.content.decode('utf-8'))  # the response body is plain JSON
    if 'error_code' in text_dict:
        return 'error'
    return text_dict['trans_result'][0]['dst']

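For reference, the same signing scheme (sign = MD5(appid + q + salt + secretkey)) can be written more compactly by letting requests build and encode the query string, which makes the manual urllib.parse.quote call unnecessary. This is only a sketch of the call made in trans() above, not part of the original script; the name trans_v2 and the explicit appid/secretkey parameters are illustrative.

def trans_v2(q, appid, secretkey, from_lang='en', to_lang='zh'):
    # Sign the request: MD5 over appid + query text + salt + secret key.
    salt = str(random.randint(32768, 65536))
    sign = hashlib.md5((appid + q + salt + secretkey).encode('utf-8')).hexdigest()
    params = {'q': q, 'from': from_lang, 'to': to_lang,
              'appid': appid, 'salt': salt, 'sign': sign}
    try:
        resp = requests.get('http://api.fanyi.baidu.com/api/trans/vip/translate',
                            params=params, timeout=10)
        result = resp.json()
    except Exception as e:
        print(e)
        return 'error'
    if 'error_code' in result:
        return 'error'
    return result['trans_result'][0]['dst']
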
url = input('please input the url that you want to download:\n')
fore = 'https://ieeexplore.ieee.org'
r = requests.get(url)
html = r.content.decode('utf-8')
soup = BeautifulSoup(html, 'lxml')
# Collect the paper titles listed on the issue page.
h3 = soup.find('div', class_='cf jrnl-results-filter').find_all('h3')
h3text = []
errtitle = []
links = []
for h in h3:
    h3text.append(h.text.strip())
print(h3text)
# Look up each title's "View HTML" link; papers without one are recorded as failures.
for i in range(len(h3text)):
    if soup.find('a', attrs={'aria-label': 'View HTML:  ' + h3text[i]}) is None:
        errtitle.append(h3text[i])
        continue
    href = soup.find('a', attrs={'aria-label': 'View HTML:  ' + h3text[i]})['href']
    links.append(fore + href)
print(links)
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(chrome_options=chrome_options)
count = 0
for link in links:
    driver.get(link)
    driver.implicitly_wait(20)  # only affects element lookups; see the explicit-wait sketch after the script
    ps = driver.page_source
    lsoup = BeautifulSoup(ps, 'lxml')
    abstract = lsoup.select('body > div > div > div > div > div > div > xpl-root > xpl-document-details > div > div > div > div > section > div > div > xpl-document-abstract > section > div > div > div > div > div')[0].text
    abstract = trans(abstract)
    if abstract == 'error':
        errtitle.append(link)
        continue
    writetxt(r'C:\Users\Gawen\Desktop\abstract.txt', link, abstract)  # output path
    count += 1
    print(count)
    time.sleep(5)
driver.close()
print("共有" + str(len(errtitle)) + "篇论文下载失败")
for err in errtitle:
    print(err)
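One caveat in the loop above: implicitly_wait() only applies to find_element lookups, so reading page_source right after driver.get() can still capture the page before the Angular front end has rendered the abstract, and the very long CSS selector is brittle. A more robust variant waits explicitly for the abstract component; this is a sketch that assumes the abstract is rendered inside an xpl-document-abstract element, and it is not code from the original post.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_abstract(driver, link, timeout=20):
    # Load the article page and block until the abstract component is present.
    driver.get(link)
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'xpl-document-abstract')))
    node = BeautifulSoup(driver.page_source, 'lxml').find('xpl-document-abstract')
    return node.get_text(strip=True) if node else ''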
Reposted from: https://www.cnblogs.com/gawen4/p/9870330.html
