I've been reading NeurIPS papers recently, but downloading them one at a time is tedious, so I wanted a quick way to fetch them in bulk.
That made me think of Python web scraping, which I had long heard about but never tried, so this was my first attempt.
The script uses the requests, BeautifulSoup (bs4), and urllib.request packages.
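In a nutshell: requests fetches a page's HTML, BeautifulSoup turns it into a searchable tree, and urllib.request.urlretrieve saves a remote file to disk. A minimal sketch of that pipeline (the URL is the NeurIPS 2020 index used below; the selector and output path are only illustrative):

import requests
from bs4 import BeautifulSoup
from urllib.request import urlretrieve

html = requests.get('https://proceedings.neurips.cc/paper/2020').text  # 1. fetch the HTML
soup = BeautifulSoup(html, 'lxml')                                     # 2. parse it into a tree
links = [a.get('href') for a in soup.select('a')]                      # 3. query it with a CSS selector
# urlretrieve(some_pdf_url, 'out.pdf')                                 # 4. save a remote file locally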
Here is the final working program:
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlretrieve

BASE_URL = 'https://proceedings.neurips.cc/'

# Open one paper's detail page and follow its PDF link
def openAndDownload(url, title):
    str_subhtml = requests.get(url)
    soup1 = BeautifulSoup(str_subhtml.text, 'lxml')
    # On the detail page, the 4th <a> is the "Paper" (PDF) link
    subdata = soup1.select('body > div.container-fluid > div > div > a:nth-child(4)')
    downloadUrl = BASE_URL + subdata[0].get('href')
    print(downloadUrl)
    getFile(downloadUrl, title)

# Download a single file into ./essay/
def getFile(url, title):
    title = replaceIllegalStr(title)
    filename = title + '.pdf'
    urlretrieve(url, './essay/%s' % filename)
    print('Successfully downloaded ' + title)

# Strip characters that are illegal in file names
def replaceIllegalStr(name):
    for ch in (':', '?', '/', '\\'):
        name = name.replace(ch, '')
    return name

def main():
    os.makedirs('./essay', exist_ok=True)  # urlretrieve fails if the target directory is missing
    url = 'https://proceedings.neurips.cc/paper/2020'
    strhtml = requests.get(url)
    soup = BeautifulSoup(strhtml.text, 'lxml')
    # Each paper on the index page is an <a> inside the <ul> of titles
    data = soup.select('body > div.container-fluid > div > ul > li > a')
    paper_list = []  # renamed from `list` to avoid shadowing the builtin
    for item in data:
        paper_list.append([item.get_text(), item.get('href')])
    name = ['title', 'link']
    test = pd.DataFrame(columns=name, data=paper_list)
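    # The loop below is an assumed continuation, not the author's original code:
    # presumably main() finishes by walking the collected rows and downloading
    # each paper. `link` is assumed to be a site-relative href, hence BASE_URL.
    for title, link in zip(test['title'], test['link']):
        openAndDownload(BASE_URL + link, title)

if __name__ == '__main__':
    main()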