A script for download all papers on ICML conference

A script

# coding=utf-8
from multiprocessing import Pool
import requests
from bs4 import BeautifulSoup
import traceback
import re
import os
import pdb

prefix = 'http://proceedings.mlr.press/v80/'
save_dir = 'icml2018'

def get_pdf(data):
    href, title = data
    name = re.sub(r'[\\/:*?"<>|\bx0\u2019\u2014\xb0\u2013]', ' ', title) 
    if os.path.isfile(save_dir+"/icml18-%s.pdf" % name):
        print("File already exsists, skip %s" % name)
        return
    try:
        content = requests.get(href).content
        with open(save_dir+"/icml18-%s.pdf" % name, 'wb') as f:  # You may change to "path/to/your/folder"
            f.write(content)
        print("Finish downloading %s" % title)
    except:
        print('Error when downloading %s' % href)
        print(traceback.format_exc())
        
pool = Pool(100)
if not os.path.exists(save_dir):
    os.mkdir(save_dir)
html = requests.get(prefix).content
soup = BeautifulSoup(html, "lxml")
a_list = soup.findAll("p", {"class": "links"})
title_list = soup.findAll("p", {"class": "title"})
title_list = [_.text for _ in title_list]
pdf_list = []
for everya in a_list:
    if everya.contents[3].text == "Download PDF":
        href = everya.contents[3].get("href")
        pdf_list.append(href)
assert len(pdf_list) == len(title_list), "numbers of title and pdf not euqal"
print("Find %d papers" % len(pdf_list))
pool.map(get_pdf, zip(pdf_list, title_list))
print("Find %d papers" % len(pdf_list))
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值