# Python 使用 beautifulsoup4

'''
安装:
pip install beautifulsoup4

项目地址:
https://pypi.org/project/beautifulsoup4/

Beautiful Soup 4.12.0 文档
https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
'''
'''
安装解析器
pip install lxml
https://pypi.org/project/lxml/
'''
# -*- coding: UTF-8 -*-

# 主要使用 BeautifulSoup 类
from bs4 import BeautifulSoup

# Sample HTML document that is parsed into `soup` below.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Parse the sample document using the "html.parser" backend.
soup = BeautifulSoup(html_doc, 'html.parser')
# Alternative ways to build the soup, kept for reference:
# with open("index.html") as fp:
#     soup = BeautifulSoup(fp, 'html.parser')
#
# soup = BeautifulSoup("<html>a web page</html>", 'html.parser')
# soup = BeautifulSoup(html_doc, 'lxml')

# Pretty-print the parsed page
print(soup.prettify())

'''
输出:
<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>
'''


# Extract all of the text content from the document.
# Per the Beautiful Soup docs, `.text` is the property form of `get_text()`,
# so both lines print the same text.
print(soup.get_text())
print(soup.text)

'''
# The Dormouse's story
#
# The Dormouse's story
# Once upon a time there were three little sisters; and their names were
# Elsie,
# Lacie and
# Tillie;
# and they lived at the bottom of a well.
# 
...
'''

# find() and find_all()
# find() returns the first matching element (a Tag object); find_all() returns a list of all matches.
# Signatures per the Beautiful Soup 4 documentation
# (the `string` argument was called `text` before 4.4.0; older code may still pass `text=`):
# find(name=None, attrs={}, recursive=True, string=None, **kwargs)
# find_all(name=None, attrs={}, recursive=True, string=None, limit=None, **kwargs)

# Search by tag name: get every <a> tag in the document
print(soup.find_all(name='a'))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

# Search by attribute: pass a dict of attribute names/values through `attrs`
print(soup.find_all(attrs={'class': 'sister'}))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

# Same search, but `limit` caps the number of results returned
print(soup.find_all(attrs={'class': 'sister'}, limit=2))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

# Search by text: get the text node whose content is exactly "Elsie".
# `string=` is the current keyword for this filter; `text=` is its pre-4.4.0 name.
# find() returns a single text node (or None when nothing matches), not a list,
# so the printed result is the bare string.
print(soup.find(string='Elsie'))
# Elsie

# Find every <a> tag and every <p> tag: a list of names matches any name in the list
tag_list = soup.find_all(['a', 'p'])
print(tag_list)
'''
[<p class="title"><b>The Dormouse's story</b></p>,
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, 
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, <p class="story">...</p>]
'''

# Tag objects
# A Tag object is what find() on a BeautifulSoup object returns.
# A Tag corresponds to an HTML or XML tag in the original document.
# Tags expose many methods and attributes for navigating and searching the tree
# and for reading the tag's content.
# Commonly used attributes:
# name:  the tag name
# attrs: all of the tag's attributes as key/value pairs
# text:  the tag's text content as a string

a = soup.find(name='a')
print(type(a))
# <class 'bs4.element.Tag'>

print(a)
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

print(a.name)
# a

# Tag attributes, returned as a dict
print(a.attrs)
# {'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'}

# Read one attribute value, form 1: subscript access
print(a['class'])
# ['sister']

# Read one attribute value, form 2: .get()
print(a.get('class'))
# ['sister']

# Text content of the tag
print(a.text)
# Elsie
# 其他:

# Multi-valued attributes
# A tag's `class` attribute is multi-valued. Searching by CSS class name matches
# a tag if ANY one of its classes equals the name searched for:

css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
print(css_soup.find_all("p", class_="strikeout"))
# [<p class="body strikeout"></p>]

print(css_soup.find_all("p", class_="body"))
# [<p class="body strikeout"></p>]

# The full attribute string also matches. Per the Beautiful Soup docs this is an
# exact-string comparison, so a reordered value such as "strikeout body" would not match.
print(css_soup.find_all("p", class_="body strikeout"))
# [<p class="body strikeout"></p>]

# The same class search expressed through the `attrs` dict
print(css_soup.find_all(name="p", attrs={'class': 'strikeout'}))
# [<p class="body strikeout"></p>]

# To require several CSS classes at once, use a CSS selector instead
print(css_soup.select("p.strikeout.body"))
# [<p class="body strikeout"></p>]

# select() locates tags with CSS selectors.
# It can match by tag name, class name or id; note that it returns a list.
# soup.select('a')          # by tag name
# soup.select('.sister')    # by class
# soup.select('#link1')     # by id
'''
参考:
https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
https://mp.weixin.qq.com/s/byR5pYSEhyTmlDHvjgBifg
https://mp.weixin.qq.com/s/AEdOWzo545pbDwfhZkv9zA
https://mp.weixin.qq.com/s/DsJ5cqAQkjSA3haS3Q7CyQ
https://mp.weixin.qq.com/s/Vz4C2tobVGzweXU-1BPiAA
'''
  • 5
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值