简单请求:
# Fetch a page with urllib; show the response object vs. the raw HTML bytes.
from urllib.request import urlopen

response = urlopen("http://pythonscraping.com/pages/page1.html")
print(response)          # the HTTPResponse object itself
print(response.read())   # the page body as bytes
使用BeautifulSoup解析:
# Parse the fetched page with BeautifulSoup and print its first <h1> tag.
from urllib.request import urlopen

from bs4 import BeautifulSoup

page = urlopen("http://pythonscraping.com/pages/page1.html")
soup = BeautifulSoup(page.read(), "html.parser")
print(soup.h1)
请求异常处理,和标签不存在的处理:
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup
def get_title(url):
    """Fetch *url* and return its first <h1> inside <body>, or None on failure.

    Returns None when the request fails (HTTP error status or unreachable
    server) or when the page lacks a <body>/<h1> tag.
    """
    try:
        html = urlopen(url)
    except (HTTPError, URLError):
        # Server returned an error status, or the server could not be reached.
        return None
    try:
        bs_obj = BeautifulSoup(html.read(), "html.parser")
        # Attribute access on a missing tag yields None; chaining off None
        # raises AttributeError, which we treat as "no title found".
        title = bs_obj.body.h1
    except AttributeError:
        return None
    return title
# Demonstrate get_title(): falls back to a message when no title was found.
title = get_title("http://pythonscraping.com/pages/page1.html")
print("Title could not be found" if title is None else title)
findAll()初识:get_text()会清除标签返回包含文本的字符串
# findAll demo: get_text() strips the tags and returns only the enclosed text.
from urllib.request import urlopen

from bs4 import BeautifulSoup

page = urlopen("http://pythonscraping.com/pages/warandpeace.html")
soup = BeautifulSoup(page, "html.parser")
for tag in soup.findAll("span", {"class": "green"}):
    print(tag.get_text())
find()和findAll()
- name为标签名或者标签名组成的集合
- attrs为标签属性及对应的值组成的字典
- recursive为是否递归查找所有后代标签(默认True;设为False时只查找一级子标签)
- text根据文本内容查找
- limit限制查找个数,find等价于limit=1时的findAll
- **kwargs为查找特定属性的标签(class属性由于关键字冲突可改为class_)
findAll(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
find(self, name=None, attrs={}, recursive=True, text=None, **kwargs)
BeautifulSoup对象
- BeautifulSoup对象
- 标签Tag对象
- NavigableString对象----表示标签里的文本
- Comment对象----表示HTML文档中的注释内容
处理子标签,兄弟标签
# Navigating DOWN (children) and SIDEWAYS (next siblings) from a tag.
from urllib.request import urlopen

from bs4 import BeautifulSoup

page = urlopen("http://pythonscraping.com/pages/page3.html")
soup = BeautifulSoup(page, "html.parser")

gift_table = soup.find("table", {"id": "giftList"})
# Direct children of the table (rows, plus any whitespace text nodes).
for child in gift_table.children:
    print(child)
# Siblings that FOLLOW the first <tr>; the row itself is not included.
for sibling in gift_table.tr.next_siblings:
    print(sibling)
父标签
# Navigating UP: from an <img> to its parent's previous sibling's text.
from urllib.request import urlopen

from bs4 import BeautifulSoup

page = urlopen("http://pythonscraping.com/pages/page3.html")
soup = BeautifulSoup(page, "html.parser")

img = soup.find("img", {"src": "../img/gifts/img1.jpg"})
neighbor_cell = img.parent.previous_sibling
print(neighbor_cell.get_text())
正则表达式和BeautifulSoup
# Combining regular expressions with BeautifulSoup attribute matching.
from urllib.request import urlopen
import re

from bs4 import BeautifulSoup

html = urlopen("http://pythonscraping.com/pages/page3.html")
bs_obj = BeautifulSoup(html, "html.parser")
# Use a raw string for the pattern: "\." in a plain string literal is an
# invalid escape sequence (a SyntaxWarning since Python 3.12).
images = bs_obj.findAll("img", {"src": re.compile(r"\.\./img/gifts/img.*\.jpg")})
for image in images:
    print(image["src"])