'''
安装:
pip install beautifulsoup4
项目地址:
https://pypi.org/project/beautifulsoup4/
Beautiful Soup 4.12.0 文档
https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
'''
'''
安装解析器
pip install lxml
https://pypi.org/project/lxml/
'''
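# -*- coding: UTF-8 -*-
# NOTE: an encoding declaration is only honored on the first or second line of a file (PEP 263); placed this far down it is an ordinary comment.
# The BeautifulSoup class is the main entry point used below.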
from bs4 import BeautifulSoup
# Sample HTML document that every example below searches.
# Note: the markup has no closing </body> or </html> tags.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Parse the sample document using the 'html.parser' parser.
soup = BeautifulSoup(html_doc, 'html.parser')
# Other ways to build a soup, kept for reference (from an open file, from an
# inline string, or with the separately installed 'lxml' parser):
# with open("index.html") as fp:
# soup = BeautifulSoup(fp, 'html.parser')
#
# soup = BeautifulSoup("<html>a web page</html>", 'html.parser')
# soup = BeautifulSoup(html_doc, 'lxml')
# Pretty-print the parsed page.
print(soup.prettify())
'''
输出:
<html>
<head>
<title>
The Dormouse's story
</title>
</head>
<body>
<p class="title">
<b>
The Dormouse's story
</b>
</p>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
Elsie
</a>
,
<a class="sister" href="http://example.com/lacie" id="link2">
Lacie
</a>
and
<a class="sister" href="http://example.com/tillie" id="link3">
Tillie
</a>
;
and they lived at the bottom of a well.
</p>
<p class="story">
...
</p>
</body>
</html>
'''
# Get all of the text content from the document
# (first via the get_text() method, then via the .text attribute).
print(soup.get_text())
print(soup.text)
'''
# The Dormouse's story
#
# The Dormouse's story
# Once upon a time there were three little sisters; and their names were
# Elsie,
# Lacie and
# Tillie;
# and they lived at the bottom of a well.
#
...
'''
# find() and find_all()
# find() returns the first matching element (a single Tag object); find_all() returns a list.
# find(self, name=None, attrs={}, recursive=True, text=None,**kwargs):
# find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
# Search by tag name: get every <a> tag in the document.
print(soup.find_all(name='a'))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
# Search by attribute: pass a dict of attribute names and values via attrs.
print(soup.find_all(attrs={'class': 'sister'}))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
# Same search, capped at the first two matches with limit=2.
print(soup.find_all(attrs={'class': 'sister'}, limit=2))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
# Search by text: find the string in the document that is exactly 'Elsie'.
# `string=` is the current keyword for this filter (Beautiful Soup 4.4.0+);
# the older `text=` spelling is deprecated.
print(soup.find(string='Elsie'))
# Elsie
# (find() returns a single string here, not a list.)
# Passing a list of names matches any of them: find every <a> and <p> tag.
tag_list = soup.find_all(['a', 'p'])
print(tag_list)
'''
[<p class="title"><b>The Dormouse's story</b></p>,
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>,
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, <p class="story">...</p>]
'''
# Tag objects
# A Tag object is what the BeautifulSoup object's find() method returns.
# A Tag corresponds to an HTML or XML tag in the original document.
# Tags have many methods and attributes for navigating and searching the tree and for reading tag content.
# Commonly used attributes:
# name: the tag's name
# attrs: all of the tag's attributes, as keys and values
# text: the tag's text, as a string
a = soup.find(name='a')
print(type(a))
# <class 'bs4.element.Tag'>
print(a)
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print(a.name)
# a
# Tag attributes, returned as a dict
print(a.attrs)
# {'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'}
# Read an attribute value, option 1: subscript access
print(a['class'])
# ['sister']
# Read an attribute value, option 2: get()
print(a.get('class'))
# ['sister']
# The tag's text content
print(a.text)
# Elsie
# Other topics:
# Multi-valued attributes
# A tag's class attribute is multi-valued. Searching by CSS class name matches
# a tag if any one of its class names equals the search value:
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
print(css_soup.find_all("p", class_="strikeout"))
# [<p class="body strikeout"></p>]
print(css_soup.find_all("p", class_="body"))
# [<p class="body strikeout"></p>]
# The exact string value of the class attribute also matches.
print(css_soup.find_all("p", class_="body strikeout"))
# [<p class="body strikeout"></p>]
# The same class search expressed through the attrs dict.
print(css_soup.find_all(name="p", attrs={'class': 'strikeout'}))
# [<p class="body strikeout"></p>]
# To require several CSS classes at once, use a CSS selector instead.
print(css_soup.select("p.strikeout.body"))
# [<p class="body strikeout"></p>]
# select() locates tags with a CSS selector.
# It can target by tag name, class name or id; note that it returns a list.
# soup.select('a')        # by tag name
# soup.select('.sister')  # by class
# soup.select('#link1')   # by id
'''
参考:
https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
https://mp.weixin.qq.com/s/byR5pYSEhyTmlDHvjgBifg
https://mp.weixin.qq.com/s/AEdOWzo545pbDwfhZkv9zA
https://mp.weixin.qq.com/s/DsJ5cqAQkjSA3haS3Q7CyQ
https://mp.weixin.qq.com/s/Vz4C2tobVGzweXU-1BPiAA
'''