# coding=utf-8
import os
import pdb
import re
import traceback
from multiprocessing import Pool

import requests
from bs4 import BeautifulSoup
prefix = 'http://proceedings.mlr.press/v80/'
save_dir = 'icml2018'

# Characters replaced by a space when a paper title is turned into a file
# name: path separators, characters Windows forbids in file names, and a few
# typographic symbols (right single quote, em dash, degree sign, en dash).
# Inside a character class ``\b`` is the backspace control character.
# The previous pattern also listed the literal letters ``x`` and ``0``, which
# mangled ordinary titles; they are intentionally not part of the class.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\b\u2019\u2014\xb0\u2013]')


def get_pdf(data):
    """Download one paper's PDF into ``save_dir`` unless it is already there.

    Args:
        data: ``(href, title)`` pair. ``href`` is the URL of the PDF and
            ``title`` is the paper title the file name is derived from.

    Returns:
        None. Progress and failures are reported on stdout. Download errors
        are caught and printed so that one bad paper does not abort the
        whole batch this function is mapped over.
    """
    href, title = data
    name = _UNSAFE_FILENAME_CHARS.sub(' ', title)
    path = save_dir + "/icml18-%s.pdf" % name
    if os.path.isfile(path):
        print("File already exsists, skip %s" % name)
        return
    try:
        # A timeout keeps a stalled connection from blocking this worker
        # forever.
        response = requests.get(href, timeout=60)
        # Fail before touching the disk: otherwise an HTTP error page would
        # be saved as a ".pdf" and skipped as "already exists" on later runs.
        response.raise_for_status()
        with open(path, 'wb') as f:  # You may change to "path/to/your/folder"
            f.write(response.content)
        print("Finish downloading %s" % title)
    except Exception:
        # Best-effort per paper: report and carry on. ``Exception`` (rather
        # than a bare ``except``) lets KeyboardInterrupt/SystemExit through.
        print('Error when downloading %s' % href)
        print(traceback.format_exc())
def main():
    """Scrape the proceedings index at ``prefix`` and download every paper.

    Fetches the index page, pairs each paper title with its "Download PDF"
    link, and maps ``get_pdf`` over the pairs with a pool of 100 worker
    processes. Files are written under ``save_dir``, which is created if
    missing.

    Raises:
        requests.HTTPError: if the index page request returns an error status.
        RuntimeError: if the number of PDF links found differs from the
            number of titles, since the two lists are paired by position.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(save_dir, exist_ok=True)

    response = requests.get(prefix, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "lxml")

    a_list = soup.find_all("p", {"class": "links"})
    title_list = [p.text for p in soup.find_all("p", {"class": "title"})]

    pdf_list = []
    for everya in a_list:
        # Look the link up by its label instead of a fixed child index, so
        # extra or missing whitespace nodes in the markup cannot raise.
        for anchor in everya.find_all("a"):
            if anchor.text == "Download PDF":
                pdf_list.append(anchor.get("href"))
                break

    # Titles and links are zipped by position, so a count mismatch would
    # save PDFs under the wrong titles. Raised explicitly because ``assert``
    # is stripped when Python runs with -O.
    if len(pdf_list) != len(title_list):
        raise RuntimeError("numbers of title and pdf not euqal")
    print("Find %d papers" % len(pdf_list))

    # The pool is created only after scraping succeeded and is released
    # when the block exits; map() has returned every result by then.
    with Pool(100) as pool:
        pool.map(get_pdf, zip(pdf_list, title_list))
    print("Find %d papers" % len(pdf_list))


# Required for multiprocessing: under the "spawn" start method each worker
# re-imports this module, and must not start scraping or create pools itself.
if __name__ == "__main__":
    main()