爬取整个网页
from urllib2 import urlopen

# Open the target URL, read the complete response body, and print it.
html = urlopen('网页URL')
page_source = html.read()
print(page_source)
使用BeautifulSoup获取网页指定标签
from urllib2 import urlopen
from bs4 import BeautifulSoup

# Download the page and hand its markup to BeautifulSoup, using the
# 'html.parser' backend.
html = urlopen('网页URL')
markup = html.read()
bs = BeautifulSoup(markup, 'html.parser')

# Print the page's H1 tag.
h1_tag = bs.h1
print(h1_tag)
异常处理
# URLError must be imported as well: it is named in an except clause below,
# and leaving it out turns any URL failure into a NameError.
from urllib2 import urlopen, HTTPError, URLError

# Fetch the page, printing the error instead of crashing when the request
# fails. HTTPError is a subclass of URLError, so it has to be caught first;
# otherwise the URLError clause would swallow HTTP errors too.
try:
    html = urlopen('网页URL')
except HTTPError as e:
    # The server responded with an HTTP error status (e.g. 404, 500).
    print(e)
except URLError as e:
    # The server could not be reached (e.g. "Server not found").
    print(e)
else:
    # Only read and print the body when the request succeeded.
    print(html.read())
BeautifulSoup的find()和findAll()
findAll(tag, attributes, recursive, text, limit, keywords)
find(tag, attributes, recursive, text, keywords)
例: