学术小伙伴每年都有很多顶会论文要追,为了能够获得第一手的顶会论文资源,就用小爬虫爬取CV顶会论文并打包下载,可以说是爬虫在手,天下我有~
爬虫的思路很直接:用 requests 获取网页,再用正则表达式解析网页信息,提取自己感兴趣的元素标签,需要的话把对应资源下载下来就可以了。整个程序写得也很简单,直接上代码。
import filecmp
import os
import pathlib
import re  # regex
import shutil
import urllib
import urllib.request

import requests
def loadPDF(page_url='https://openaccess.thecvf.com/ECCV2018',
            local_dir='C:\\Papers\\ECCV2018\\',
            download_num=3):
    """Download the first ``download_num`` paper PDFs linked from a CVF
    open-access conference index page.

    Args:
        page_url: Index page listing the papers (defaults to ECCV 2018,
            preserving the original hard-coded behavior).
        local_dir: Directory the PDFs are saved into; created if missing.
            Must end with a path separator, as it is joined by string
            concatenation.
        download_num: How many PDFs to fetch from the top of the list.
    """
    # Fetch the index page HTML.
    resp = requests.get(page_url)
    page_html = resp.text
    print('url is ' + page_url)

    # Every href ending in "pdf" whose anchor text is literally "pdf" is a
    # paper link; the *_paper.html anchors carry the human-readable titles.
    link_list = re.findall(
        r"(?<=href=\").+?pdf(?=\">pdf)|(?<=href=\').+?pdf(?=\">pdf)",
        page_html)
    name_list = re.findall(r"(?<=href=\").+?2018_paper.html\">.+?</a>",
                           page_html)
    num = len(link_list)
    print('paper links num=', num)
    print('paper name num=', len(name_list))

    # Make sure the local target directory exists.
    print('local Dir is' + local_dir)
    if not os.path.exists(local_dir):
        print('create new local path.')
        os.makedirs(local_dir)

    print('Download number is', download_num)
    if download_num < 1:
        print('no Download pdf')

    # Never index past the number of links actually found on the page
    # (the original code would raise IndexError if num < download_num).
    limit = min(download_num, num)
    cnt = 0
    while cnt < limit:
        url = link_list[cnt]
        # The title text sits between the '>' and '<' of the anchor element.
        file_name = name_list[cnt].split('<')[0].split('>')[1]
        # Replace characters that are awkward/illegal in file names with '_',
        # then turn every underscore (original or substituted) into a space —
        # identical effect to the original chain of six .replace() calls.
        file_name = re.sub(r'[:"?/&%]', '_', file_name).replace('_', ' ')
        print('[' + str(cnt) + '/' + str(num) + "] Downloading -> "
              + local_dir + file_name + '.pdf')
        try:
            urllib.request.urlretrieve('http://openaccess.thecvf.com/' + url,
                                       local_dir + file_name + '.pdf')
        except Exception as exc:
            # Bug fix: the original `continue`d here without advancing `cnt`,
            # retrying a persistently failing link forever. Skip it instead.
            print('download failed, skipping:', exc)
        cnt += 1
    print("all download finished")
def main():
    """Entry point: remind the user to be online, then start the download.

    The original body assigned an unused local ``url``; that dead code is
    removed — the target URL is owned by ``loadPDF`` itself.
    """
    print('Please check your system is networking~')
    loadPDF()
# Run the scraper only when executed as a script, not when imported.
if __name__ =='__main__':
    main()