Python 自动下载论文：利用爬虫批量下载论文（Python + Selenium）

# coding=utf-8

import time

import csv

import os

from selenium import webdriver

from pyvirtualdisplay import Display

# 得到每一期主页的网址

def get_issue_url(begin_year, end_year):
    """Build the table-of-contents URL for every Econometrica issue in a year range.

    Each year in ``[begin_year, end_year)`` has 6 issues; the volume number is
    ``year - 1932``. The final three URLs are dropped (the last issues of the
    final year had not been published when this was written).

    Returns a list of Wiley issue-TOC URL strings.
    """
    toc_template = ('http://onlinelibrary.wiley.com/doi/10.1111/'
                    'ecta.%s.%s.issue-%s.x/issuetoc')
    all_issue_urls = [
        toc_template % (year, year - 1932, issue)
        for year in range(begin_year, end_year)
        for issue in range(1, 7)
    ]
    # Trim the trailing three (not-yet-published) issues of the last year.
    return all_issue_urls[:-3]

# 得到一期内每个文章(非评论类文章)网址

def get_pdf_url(issue_url):
    """Scrape one issue's TOC page for article (non-comment) PDF links and metadata.

    Uses the module-level selenium ``driver``. For each article under the
    "#group2" list it collects:

    Returns a 5-tuple of parallel lists:
        pdf_urls       -- direct .pdf URLs
        dl_file_names  -- the on-disk name Chrome will use (e.g. 'ecta12345')
        file_titles    -- 'year-issue-vol-page' strings used to rename files later
        titles         -- paper titles
        pages          -- page-range strings
    """
    driver.get(issue_url)
    time.sleep(5)  # crude wait for the TOC page to render

    issue_num = driver.find_elements_by_xpath('//span[@class="issueTocIssue"]')[0].text
    vol_num = driver.find_elements_by_xpath('//span[@class="issueTocVolume"]')[0].text
    # The heading ends with the 4-digit publication year.
    year_num = driver.find_elements_by_xpath('//h2[@class="noMargin"]')[0].text[-4:]

    pdf_urls = []
    file_titles = []
    titles = []
    pages = []
    for link in driver.find_elements_by_xpath('//*[@id="group2"]/ol/li/div/a'):
        # TOC links end in '...abstract' etc.; swap the last 4 chars for 'pdf'.
        url = link.get_attribute('href')[:-4] + 'pdf'
        # Link text looks like 'Some Title (pages 1-20)'; split off the page range.
        title, page = link.text[:-1].split(' (')
        file_title = year_num + '-' + issue_num + '-' + vol_num + '-' + page
        pdf_urls.append(url)
        file_titles.append(file_title)
        titles.append(title)
        pages.append(page)
        print(file_title)

    # On Wiley, Econometrica's download filename differs from the URL tail, so
    # each article's 'full' page must be visited to read the real 'ecta...' id.
    # (Other journals, e.g. JF, don't need this second pass.)
    dl_file_names = []
    dl_file_name = None
    for url in pdf_urls:
        full_url = url[:-3] + 'full'
        driver.get(full_url)
        try:
            ref_item = driver.find_elements_by_xpath(
                '//li[@class="article-header__references-item"]')[0]
            dl_file_name = ref_item.get_attribute('id').split('-')[0].lower()
        except (IndexError, AttributeError):
            # BUG FIX: the original bare 'except:' read dl_file_name before any
            # assignment when the FIRST article lacked an id (NameError). Fail
            # loudly in that case instead of crashing obscurely.
            if dl_file_name is None:
                raise RuntimeError('No ecta number found for first article of %s'
                                   % issue_url)
            # Fallback: guess the next sequential ecta number from the previous one.
            dl_file_name = 'ecta' + str(int(dl_file_name[4:]) + 1)
            print('Error! No ecta number!')
        print(dl_file_name)
        dl_file_names.append(dl_file_name)

    return pdf_urls, dl_file_names, file_titles, titles, pages

# pdf下载函数

def download_pdf(driver, pdf_url):
    """Download one PDF by navigating ``driver`` to ``pdf_url``.

    The driver is assumed to be configured (via Chrome prefs) to save PDFs to
    disk instead of previewing them. Runs inside a headless virtual X display.
    """
    display = Display(visible=0, size=(800, 600))
    display.start()
    try:
        driver.get(pdf_url)
        time.sleep(3)  # give Chrome time to finish writing the download
    finally:
        # BUG FIX: the original leaked the virtual display if driver.get raised;
        # always stop it.
        display.stop()

# 保存csv函数

def save_csv(data, first_row, path='/Users/your_path/title.csv'):
    """Write ``first_row`` as a header followed by ``data`` rows to a CSV file.

    Parameters:
        data      -- iterable of row sequences
        first_row -- header row sequence
        path      -- output file path (default keeps the original hardcoded
                     location, so existing callers are unaffected)
    """
    # BUG FIX: the original used the 'file()' builtin (removed in Python 3)
    # without closing on error; use open() in a context manager instead.
    with open(path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(first_row)
        writer.writerows(data)

if __name__ == '__main__':
    # --- configure a Chrome driver that saves PDFs to disk instead of previewing ---
    options = webdriver.ChromeOptions()
    prefs = {
        "download.prompt_for_download": False,
        'download.default_directory': '/Users/your_path',
        "plugins.always_open_pdf_externally": True,
    }
    options.add_experimental_option('prefs', prefs)
    driver = webdriver.Chrome(executable_path='/Users/your_path/chromedriver/chromedriver',
                              chrome_options=options)

    # --- scrape article URLs and metadata for every issue in the year range ---
    begin_year = 2015
    end_year = 2018
    issue_url_list = get_issue_url(begin_year, end_year)

    pdf_url_list = []
    dl_file_name_list = []
    file_title_list = []
    title_list = []
    page_list = []
    for issue_url in issue_url_list:
        (one_issue_pdf_url_list, one_issue_dl_file_name_list,
         one_issue_file_title_list, one_issue_title_list,
         one_issue_page_list) = get_pdf_url(issue_url)
        pdf_url_list += one_issue_pdf_url_list
        dl_file_name_list += one_issue_dl_file_name_list
        file_title_list += one_issue_file_title_list
        title_list += one_issue_title_list
        page_list += one_issue_page_list

    # --- persist the metadata as CSV ---
    # NOTE(review): the original Python 2 code .encode('utf8')-ed every field
    # before writing; Python 3's csv module handles unicode natively, so only
    # the page string is sanitised (u-umlaut appears in some page ranges).
    page_list = [p.replace('ü', '_') for p in page_list]
    data = list(zip(pdf_url_list, dl_file_name_list, file_title_list,
                    title_list, page_list))
    save_csv(data, ['pdf_url', 'download_file_name', 'file_name',
                    'paper_title', 'page'])

    # --- download each PDF, skipping files already on disk ---
    with open('/Users/your_path/title.csv') as f:
        data_list = list(csv.reader(f))
    (pdf_url_list, dl_file_name_list, file_new_name_list,
     paper_name_list, page_list) = map(list, zip(*data_list[1:]))

    download_dir = '/Users/your_path'
    disk_name_list = os.listdir(download_dir)
    for url, dl_name in zip(pdf_url_list, dl_file_name_list):
        if dl_name + '.pdf' not in disk_name_list:
            download_pdf(driver, url)

    # --- rename downloaded files to their 'year-issue-vol-page' titles ---
    for old_name in os.listdir(download_dir):
        stem = old_name.split('.')[0]
        if stem in dl_file_name_list:
            new_name = file_new_name_list[dl_file_name_list.index(stem)]
            # BUG FIX: the original concatenated directory and filename with no
            # '/' separator ('/Users/your_path' + old_name), producing paths
            # like '/Users/your_pathecta12345.pdf'. os.path.join is correct.
            os.rename(os.path.join(download_dir, stem + '.pdf'),
                      os.path.join(download_dir, new_name + '.pdf'))

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值