1.安装BeautifulSoup模块
pip install beautifulsoup4
2.文件中引入
from bs4 import BeautifulSoup
3.使用BeautifulSoup
# Walkthrough of common BeautifulSoup lookups against a sample HTML file.
# Every step prints its result so the output can be compared with the markup.

# Read the whole sample document; the with-statement closes the file as soon
# as the read is done, so parsing happens on the in-memory string.
with open('./tests/python.html', encoding='utf-8') as f:
    texts = f.read()

# Parse the markup with the "html.parser" backend.
bs = BeautifulSoup(texts, 'html.parser')

# The <title> tag itself
print(bs.title)
# Text of the node
print(bs.title.text)
# Name of the node
print(bs.title.name)
# Name of the parent node
print(bs.title.parent.name)

# All direct children of the first <p> tag.
# .children is an iterator, so printing it directly only shows the iterator
# object; wrapping it in list() shows the actual child nodes.
print(bs.p.children)
print(list(bs.p.children))

# An attribute of the node (subscript access raises KeyError when the
# attribute is missing)
print(bs.p['class'])

# Every node with the given tag name; search once and reuse the result
links = bs.find_all('a')
print(links)
for link in links:
    print(link['href'])
# Only the first match
print(bs.find('a'))

# Searching by condition
# Find the <p> tag whose class is "title". Because `class` is a Python
# keyword, the filter is spelled class_ with a trailing underscore.
# find returns a single object.
print(bs.find('p', class_='title'))
print(bs.find('p', id='title1'))
# Find every tag whose class is "title"; find_all returns a list
print(bs.find_all(class_='title'))

# All text content of the document
print(bs.get_text())
# Pretty-print the whole document with indentation
print(bs.prettify())