用 BeautifulSoup 爬取了伯克利大学 CS164(Programming Languages and Compilers)课程的课件
import re
import requests
from bs4 import BeautifulSoup
# Download every lecture PDF linked from the UC Berkeley CS164 (Fall 2011)
# lecture index page and save each one in the current working directory as
# "<href without .pdf>-<title text>.pdf".

# Directory that holds both index.html and the lecture PDFs; the hrefs on the
# index page are appended to it verbatim.
LECTURES_URL = "http://inst.eecs.berkeley.edu/~cs164/fa11/lectures/"
# Seconds to wait on the server; without a timeout a stalled connection
# would hang the script forever.
REQUEST_TIMEOUT = 30

r = requests.get(LECTURES_URL + "index.html", timeout=REQUEST_TIMEOUT)
# Fail loudly rather than parse an HTTP error page as if it were the index.
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")

# Raw string, escaped dot, anchored end: only hrefs that really end in ".pdf"
# match, so the [:-4] slice below strips exactly that extension.
lecture_href = re.compile(r"lecture[0-9]*\.pdf$")

for elem in soup.find_all("a", attrs={"href": lecture_href}):
    stem = elem["href"][:-4]
    # The title text is read from the element that precedes the link's parent.
    # NOTE(review): this relies on the index page's layout -- confirm it still
    # holds if the page changes.
    title_elem = elem.find_parent().find_previous_sibling()
    if title_elem is None:
        # No preceding element to take a title from: name the file after the
        # href alone instead of crashing on None.
        file_name = stem + ".pdf"
    else:
        # Every ":" becomes a space (the same result as joining the
        # ":"-separated pieces with " "); presumably because ":" is not a
        # valid file-name character on some platforms.
        # NOTE(review): other characters such as "/" or newlines in the title
        # are passed through unchanged -- verify against the real page.
        file_name = stem + "-" + title_elem.get_text().replace(":", " ") + ".pdf"

    file_url = LECTURES_URL + elem["href"]
    # The context manager releases the streamed connection even if writing
    # the file fails part-way.
    with requests.get(file_url, stream=True, timeout=REQUEST_TIMEOUT) as file_get:
        if not file_get.ok:
            # Do not save an HTTP error page under a .pdf name; report it and
            # carry on with the remaining lectures.
            print("skipping %s: HTTP %d" % (file_url, file_get.status_code))
            continue
        with open(file_name, "wb") as f:
            for chunk in file_get.iter_content(chunk_size=1024):
                if chunk:  # skip empty keep-alive chunks
                    f.write(chunk)