# coding=utf-8
import time
import csv
import os
from selenium import webdriver
from pyvirtualdisplay import Display
# Build the table-of-contents URL of every issue
def get_issue_url(begin_year,end_year):
    """Build the Wiley table-of-contents URLs for a range of years.

    For every year in ``range(begin_year, end_year)`` six issue URLs
    (issues 1 through 6) are generated; the volume number is the year
    minus 1932.

    Args:
        begin_year: first year to include.
        end_year: year to stop at (exclusive).

    Returns:
        The generated URLs in chronological order, without the final three
        entries (presumably the issues of the last year that were not yet
        published when the script was written -- confirm before reusing).
    """
    template = 'http://onlinelibrary.wiley.com/doi/10.1111/ecta.%s.%s.issue-%s.x/issuetoc'
    every_issue = [
        template % (str(year), str(year - 1932), str(issue))
        for year in range(begin_year, end_year)
        for issue in range(1, 7)
    ]
    # The last three generated URLs are always discarded.
    return every_issue[:-3]
# Collect the URL of every article (excluding commentary pieces) in one issue
def get_pdf_url(issue_url):
    """Scrape one issue page and resolve each article's download file name.

    Uses the module-level Selenium ``driver``. The issue page is loaded
    first to collect article links; then every article's ``full`` page is
    visited to read the identifier used as the downloaded file's name.

    Args:
        issue_url: URL of the issue's table-of-contents page.

    Returns:
        A 5-tuple of parallel lists with one entry per article link:
        ``(pdf_urls, download_file_names, file_titles, titles, pages)``.

    Raises:
        RuntimeError: if the first article of the issue exposes no
            identifier, so there is no previous number to extrapolate from.
        ValueError: if a link text does not have the ``"title (pages)"``
            shape.
    """
    driver.get(issue_url)
    time.sleep(5)  # fixed wait after loading the issue page
    issue_num = driver.find_elements_by_xpath('//span[@class="issueTocIssue"]')[0].text
    vol_num = driver.find_elements_by_xpath('//span[@class="issueTocVolume"]')[0].text
    # Only the last four characters of the heading are kept as the year.
    year_num = driver.find_elements_by_xpath('//h2[@class="noMargin"]')[0].text[-4:]

    one_issue_pdf_url_list = []
    one_issue_file_title_list = []
    one_issue_title_list = []
    one_issue_page_list = []
    for link in driver.find_elements_by_xpath('//*[@id="group2"]/ol/li/div/a'):
        # Replace the last four characters of the href with "pdf".
        url = link.get_attribute('href')[:-4] + 'pdf'
        # Drop the trailing character, then split on the LAST " (" so that
        # titles which themselves contain " (" no longer break the unpacking.
        title, page = link.text[:-1].rsplit(' (', 1)
        file_title = year_num + '-' + issue_num + '-' + vol_num + '-' + page
        one_issue_pdf_url_list.append(url)
        one_issue_file_title_list.append(file_title)
        one_issue_title_list.append(title)
        one_issue_page_list.append(page)
        print(file_title)

    one_issue_dl_file_name_list = []
    previous_name = None
    for url in one_issue_pdf_url_list:
        # On Wiley, Econometrica's article URL suffix differs from the name of
        # the downloaded file, so each article page has to be visited as well
        # (some journals, e.g. JF, do not need this second pass).
        full_url = url[:-3] + 'full'
        driver.get(full_url)
        try:
            item = driver.find_elements_by_xpath('//li[@class="article-header__references-item"]')[0]
            dl_file_name = item.get_attribute('id').split('-')[0].lower()
        except (IndexError, AttributeError):
            # No matching element (IndexError) or no id attribute
            # (AttributeError on None): guess "previous number + 1".
            if previous_name is None:
                raise RuntimeError(
                    'No ecta number found and no previous article to infer it from: '
                    + full_url)
            dl_file_name = 'ecta' + str(int(previous_name[4:]) + 1)
            print('Error! No ecta number!')
        print(dl_file_name)
        one_issue_dl_file_name_list.append(dl_file_name)
        previous_name = dl_file_name
    return one_issue_pdf_url_list,one_issue_dl_file_name_list,one_issue_file_title_list,one_issue_title_list,one_issue_page_list
# PDF download function
def download_pdf(driver,pdf_url):
    """Open ``pdf_url`` in ``driver`` while a virtual display is running.

    Args:
        driver: Selenium WebDriver used to request the PDF.
        pdf_url: URL of the PDF to fetch.

    The virtual display is always stopped, even when loading the URL
    raises, so no display process is left behind.
    """
    display = Display(visible=0, size=(800, 600))
    display.start()
    try:
        driver.get(pdf_url)
        time.sleep(3)  # fixed wait after requesting the PDF
    finally:
        display.stop()
# CSV saving function
def save_csv(data,first_row,csv_path='/Users/your_path/title.csv'):
    """Write a header row followed by ``data`` to a CSV file.

    Args:
        data: iterable of rows to write after the header.
        first_row: header row.
        csv_path: destination file; defaults to the path the script has
            always used.
    """
    # Binary mode matches what this Python 2 script used for the csv module.
    # The ``with`` block closes the file even if a write fails.
    with open(csv_path, 'wb') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(first_row)
        writer.writerows(data)
if __name__=='__main__':
    # Script entry point: scrape article metadata, save it to a CSV, download
    # the PDFs that are still missing, then rename the downloaded files.
    download_dir = '/Users/your_path'
    csv_path = '/Users/your_path/title.csv'

    # Set up the driver: download PDFs straight into download_dir, without a
    # prompt and without opening them in the browser's viewer.
    options = webdriver.ChromeOptions()
    prefs = {
        "download.prompt_for_download": False,
        'download.default_directory': download_dir,
        "plugins.always_open_pdf_externally": True
    }
    options.add_experimental_option('prefs', prefs)
    # Kept as a module-level name on purpose: get_pdf_url() reads this global.
    driver = webdriver.Chrome(executable_path='/Users/your_path/chromedriver/chromedriver',
                              chrome_options=options)
    try:
        # Scrape the article URLs of every issue.
        begin_year = 2015
        end_year = 2018
        pdf_url_list = []
        dl_file_name_list = []
        file_title_list = []
        title_list = []
        page_list = []
        for issue_url in get_issue_url(begin_year, end_year):
            urls, dl_names, file_titles, titles, pages = get_pdf_url(issue_url)
            pdf_url_list.extend(urls)
            dl_file_name_list.extend(dl_names)
            file_title_list.extend(file_titles)
            title_list.extend(titles)
            page_list.extend(pages)

        # Encode to UTF-8 and save as CSV.
        pdf_url_list = [value.encode('utf8') for value in pdf_url_list]
        dl_file_name_list = [value.encode('utf8') for value in dl_file_name_list]
        file_title_list = [value.encode('utf8') for value in file_title_list]
        title_list = [value.encode('utf8') for value in title_list]
        page_list = [value.encode('utf8').replace('ü','_') for value in page_list]
        data = zip(pdf_url_list, dl_file_name_list, file_title_list, title_list, page_list)
        save_csv(data, ['pdf_url','download_file_name','file_name','paper_title','page'])

        # Download the PDFs, driven by the CSV that was just written.
        with open(csv_path) as csv_in:
            rows = list(csv.reader(csv_in))[1:]  # skip the header row

        disk_names = set(os.listdir(download_dir))
        # Download name -> final file name; the first occurrence wins, like
        # the list.index() lookup this mapping replaces.
        new_name_by_dl_name = {}
        for pdf_url, dl_file_name, file_new_name, _paper_title, _page in rows:
            new_name_by_dl_name.setdefault(dl_file_name, file_new_name)
            # Avoid duplicate downloads: skip files that are already on disk,
            # whether still under the download name or already renamed.
            if dl_file_name + '.pdf' in disk_names or file_new_name + '.pdf' in disk_names:
                continue
            download_pdf(driver, pdf_url)

        # Rename the downloaded files.
        for disk_name in os.listdir(download_dir):
            stem, ext = os.path.splitext(disk_name)
            # Only finished PDF downloads whose name is known are renamed.
            if ext != '.pdf' or stem not in new_name_by_dl_name:
                continue
            # os.path.join supplies the separator that plain string
            # concatenation with the directory name was missing.
            os.rename(os.path.join(download_dir, disk_name),
                      os.path.join(download_dir, new_name_by_dl_name[stem] + '.pdf'))
    finally:
        # Always shut the browser down, even after a failure.
        driver.quit()