Every year there are plenty of CVPR papers to read, so instead of hunting for them online one by one, it is easier to download them all and filter locally. The downloader is about as simple as a crawler can get. After all, a mountain need not be high, it is famous if an immortal dwells there; water need not be deep, it has spirit if a dragon lives in it; and code need not be complete, as long as it works!
#!/usr/bin/env python
# coding=utf-8
import urllib
import urllib2
import re
def getHtml(url):
    # Fetch the raw HTML of the paper index page
    page = urllib.urlopen(url)
    html = page.read()
    return html

def download_file(download_url, file_name, count):
    # Download one PDF and write it to disk in binary mode
    response = urllib2.urlopen(download_url)
    file = open(file_name, 'wb')
    file.write(response.read())
    file.close()
    print("Completed" + str(count).zfill(4))
save_path = '/home/nick/cvpr2017/'  # Destination folder (create it before running)
url = 'http://openaccess.thecvf.com/CVPR2017.py'
html = getHtml(url)
pattern = re.compile(r'\bcontent_cvpr_2017.*paper\.pdf\b')
url_list = pattern.findall(html)
print len(url_list) # Should be 783
count = 0
breakpoint = 0
for url in url_list:
    count += 1
    if count > breakpoint:  # The connection sometimes times out; set breakpoint to the last completed index to resume from there
        name = url.split('/')[-1]
        file_name = save_path + name
        download_file('http://openaccess.thecvf.com/' + url, file_name, count)
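
The script above targets Python 2 (urllib/urllib2). For anyone on Python 3, here is a minimal sketch of the same downloader using only the standard library's urllib.request; the URL, regex, and save_path are taken from the script above, and instead of the manual breakpoint counter it simply skips files that already exist, which gives the same resume-after-timeout behaviour.

# Python 3 sketch of the same CVPR 2017 downloader (standard library only)
import os
import re
import urllib.request

save_path = '/home/nick/cvpr2017/'             # same destination folder as above
base_url = 'http://openaccess.thecvf.com/'

# Fetch the index page and pull out every relative PDF path
html = urllib.request.urlopen(base_url + 'CVPR2017.py').read().decode('utf-8')
pdf_paths = re.findall(r'\bcontent_cvpr_2017.*paper\.pdf\b', html)
print(len(pdf_paths))                          # should be 783

for count, path in enumerate(pdf_paths, start=1):
    file_name = os.path.join(save_path, path.split('/')[-1])
    if os.path.exists(file_name):              # skip files finished in a previous run
        continue
    with urllib.request.urlopen(base_url + path) as response:
        with open(file_name, 'wb') as f:       # binary mode for PDFs
            f.write(response.read())
    print("Completed" + str(count).zfill(4))

Either version can simply be re-run after a timeout: the existing files on disk (or the breakpoint counter in the Python 2 script) ensure nothing is downloaded twice.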