# Reference (Chinese docs): https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/
# coding: utf-8
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'html.parser')
# print(soup.prettify())

# Tag navigation: attribute access returns the first matching tag.
print(soup.title)          # the <title> tag itself
print(soup.body.b)         # first <b> under <body> (soup.p.b reaches the same tag)
print(soup.find_all('a'))  # list of every <a> tag in the document

# Iterating the links and reading their href attribute:
# for item in soup.find_all('a'):
#     print(item['href'])
#     print(item.get('href'))

# print(soup.contents)        # list view of the whole document tree
# print(len(soup.contents))   # 2
# print(soup.get_text())      # text content only, tags stripped

print(soup.title.parent)   # the enclosing <head> tag

# Sibling navigation (nodes that share the same parent):
# sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></b></a>")
# sibling_soup.b.next_sibling      -> <c>text2</c>
# sibling_soup.c.previous_sibling  -> <b>text1</b>

# find_all with different filter forms.
print(soup.find_all(['a', 'b']))                      # match either tag name
print(soup.find_all(id='link1'))                      # by id attribute
print(soup.find_all('a', class_='sister'))            # by CSS class
print(soup.find_all("a", attrs={"class": "sister"}))  # same filter via attrs dict
# soup.find_all("a", limit=2)

# CSS selectors via select().
print(soup.select('title'))            # [<title>The Dormouse's story</title>]
print(soup.select('body a'))           # descendant search, level by level
print(soup.select('html head title'))  # [<title>The Dormouse's story</title>]
# Direct children only ('>' combinator):
print(soup.select('html > title'))     # [] -- title is not a direct child of html
print(soup.select('head > title'))     # [<title>The Dormouse's story</title>]
print(soup.select('p > #link1'))       # the first sister link
# By CSS class:
print(soup.select('.sister'))
print(soup.select('[class~=sister]'))
# By id:
print(soup.select('#link2'))           # [<a ... id="link2">Lacie</a>]
print(soup.select('a#link1'))
# soup.select("#link1,#link2")
print(soup.select('a[href]'))          # every <a> carrying an href attribute
# print(soup.prettify())
# prettify() formats the parse tree as Unicode output, placing each
# XML/HTML tag on its own line.

# Beautiful Soup exposes the same interface for every parser, but the
# parsers themselves differ: the same document may produce differently
# structured trees depending on which parser handled it.

# By default Beautiful Soup parses the document as HTML. To parse XML
# instead, pass "xml" as the second argument to the constructor:
#     soup = BeautifulSoup(markup, "xml")
# (this also requires lxml to be installed).
# coding: utf-8
from bs4 import BeautifulSoup

# Quick-start demo: parse a small document and exercise the basic API.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, "html.parser")
# print(soup.prettify())
print(soup.title)          # the <title> tag
print(soup.title.name)     # 'title'
print(soup.a)              # first <a> tag
print(soup.p['class'])     # class attribute of the first <p>
print(soup.find_all('a'))  # every <a> tag
for link in soup.find_all('a'):
    # bug fix: the loop body was not indented (IndentationError)
    print(link.get('href'))
print(soup.get_text())     # document text with tags stripped

# Available parsers (bug fix: this table was bare prose, a SyntaxError):
#   Python stdlib:  BeautifulSoup(markup, "html.parser")
#   lxml HTML:      BeautifulSoup(markup, "lxml")      -- recommended, fastest
#   lxml XML:       BeautifulSoup(markup, "lxml-xml")
#                   BeautifulSoup(markup, "xml")
#   html5lib:       BeautifulSoup(markup, "html5lib")