1 Introduction to BeautifulSoup4
1.1 Basic concepts
Beautiful Soup is a library for extracting data from HTML or XML documents.
1.2 Source code
The source code can be downloaded from GitHub.
Installation:
pip install lxml
pip install bs4
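A quick way to confirm the installation worked is to parse a tiny document. A minimal sketch, assuming both packages installed without errors:
# Sanity check: parse a one-tag document with the lxml parser
from bs4 import BeautifulSoup
soup = BeautifulSoup('<p>hello</p>', 'lxml')
print(soup.p.string)  # hello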
2 Using bs4
2.1 Quick start
- When the source HTML is messy, prettify() can be used to print it in a cleaner, indented form
import bs4
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Get a BeautifulSoup object
soup = BeautifulSoup(html_doc, features='lxml')  # equivalent: soup = BeautifulSoup(html_doc, 'lxml')
print(soup.prettify())
print(soup.title)         # <title>The Dormouse's story</title>
print(soup.title.name)    # title
print(soup.title.string)  # The Dormouse's story
links = soup.find_all('a')
for link in links:
    l = link.get('href')
    print(l)
Result: the links found in the document
http://example.com/elsie
http://example.com/lacie
http://example.com/tillie
# Get a BeautifulSoup object
bs = BeautifulSoup(html_doc, 'lxml')
# Print the document (prettify prints the tags in a cleaner, indented form)
print(bs.prettify())
print(bs.title)         # the title tag: <title>The Dormouse's story</title>
print(bs.title.name)    # the tag name: title
print(bs.title.string)  # the text inside the title tag: The Dormouse's story
print(bs.p)             # the first p tag
2.2 Types of bs4 objects
Tag: a tag
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Get a BeautifulSoup object
soup = BeautifulSoup(html_doc,features='lxml')
print(type(soup))
print(type(soup.title))
print(type(soup.a))
print(type(soup.p))
Result
<class 'bs4.BeautifulSoup'>
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>
NavigableString: a navigable string
print(soup.p.string) # The Dormouse's story
print(type(soup.p.string)) # <class 'bs4.element.NavigableString'>
title_tag = soup.p
print(title_tag)
print(title_tag.name)
print(title_tag.string)
Result
<p class="title"><b>The Dormouse's story</b></p>
p
The Dormouse's story
BeautifulSoup: the soup object representing the whole parsed document
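The BeautifulSoup object itself behaves like a special Tag covering the whole document; a minimal sketch reusing the soup built from html_doc above:
print(type(soup))  # <class 'bs4.BeautifulSoup'>
print(soup.name)   # '[document]', the name bs4 gives to the document-level object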
Comment: a comment
html_comment = '<a><!--This is a comment. Comments are not displayed in the browser.--></a>'
soup = BeautifulSoup(html_comment, 'lxml')
print(soup.a.string)
print(type(soup.a.string))
Result
This is a comment. Comments are not displayed in the browser.
<class 'bs4.element.Comment'>
3 Traversing the tree: child nodes
There are three kinds of operations in bs4: traversing the tree, searching it, and modifying it.
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Get a BeautifulSoup object
soup = BeautifulSoup(html_doc,features='lxml')
# tag
print(soup.title)
print(soup.p)
print(soup.p.b)
print(soup.a)
Result
<title>The Dormouse's story</title>
<p class="title"><b>The Dormouse's story</b></p>
<b>The Dormouse's story</b>
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# Use ['attribute'] to read an attribute value
title_tag = soup.p
print(title_tag)           # <p class="title"><b>The Dormouse's story</b></p>
print(title_tag['class'])  # ['title']
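Closely related (not shown in the original notes): a tag's full attribute dict is available as .attrs, and .get() reads a single attribute without raising when it is missing. A minimal sketch against the same soup:
a_tag = soup.a
print(a_tag.attrs)         # {'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'}
print(a_tag.get('href'))   # http://example.com/elsie
print(a_tag.get('title'))  # None, instead of a KeyError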
3.1 contents, children, descendants
contents returns a list
# contents returns a list
links = soup.contents
print(type(links))  # <class 'list'>
print(links)
Result
<class 'list'>
[<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>]
# children returns an iterator that can be looped over; first, contents and find_all on a new small document:
html = '''
<div>
<a href='#'>百度</a>
<a href='#'>阿里</a>
<a href='#'>腾讯</a>
</div>
'''
# We want the data under the div tag
soup2 = BeautifulSoup(html, 'lxml')
links = soup2.contents
print(type(links), links)  # <class 'list'> [...]
r = soup2.find_all('a')
print(r)  # [<a href="#">百度</a>, <a href="#">阿里</a>, <a href="#">腾讯</a>]
for link in r:
    print(link)         # <a href="#">百度</a> / <a href="#">阿里</a> / <a href="#">腾讯</a>
    print(link.string)  # 百度 / 阿里 / 腾讯
Result
<a href="#">百度</a>
百度
<a href="#">阿里</a>
阿里
<a href="#">腾讯</a>
腾讯
children returns an iterator, which can be looped over.
Iteration means visiting every item of a sequence in order, one at a time; a loop repeats a block of code while some condition holds.
In Python, while is a loop statement, while for ... in ... implements iteration:
the for keyword visits each item of an iterable object in turn.
soup2 = BeautifulSoup(html, 'lxml')
links = soup2.div.children
print(type(links))  # <class 'list_iterator'>
for link in links:
    print(link)
Result
<class 'list_iterator'>
<a href="#">百度</a>
<a href="#">阿里</a>
<a href="#">腾讯</a>
descendants returns a generator that traverses all descendants (children, grandchildren, and so on)
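The notes give no descendants example, so here is a minimal sketch using the same small div document (soup2) from the children example above:
# descendants walks every nested node, not just direct children,
# so it yields each <a> tag and then the text (and whitespace) nodes inside it
for d in soup2.div.descendants:
    print(d)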
3.2 .string, .strings, .stripped_strings
string gets the text content inside a tag
title_tag = soup.title
print(title_tag)
print(title_tag.string)
Result
<title>The Dormouse's story</title>
The Dormouse's story
It can also be reached through the parent tag (.string passes through when a tag has exactly one child):
head_tag = soup.head
print(head_tag.string) #The Dormouse's story
If a tag such as html contains several pieces of text, .string cannot retrieve them:
print(soup.html.string)  # None
strings returns a generator object used to get the text of multiple tags
print(soup.html.strings)  # <generator object _all_strings at 0x00000185FC4D1A40>
strings = soup.strings
for s in strings:
    print(s)
Result
The Dormouse's story
The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie
,
Lacie
and
Tillie
;
and they lived at the bottom of a well.
...
But the output contains a lot of blank lines and extra whitespace.
stripped_strings is basically the same as strings, except that it strips the extra whitespace.
strings = soup.stripped_strings
for s in strings:
    print(s)
Result
The Dormouse's story
The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie
,
Lacie
and
Tillie
;
and they lived at the bottom of a well.
...
4 Traversing the tree: parent nodes
parent and parents
parent returns the direct parent node
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Get a BeautifulSoup object
soup = BeautifulSoup(html_doc,features='lxml')
title_tag = soup.title
print(title_tag)
print(title_tag.parent)
Result
<title>The Dormouse's story</title>
<head><title>The Dormouse's story</title></head>
print(soup.html.parent)
Result
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>
parents returns all ancestor nodes
a_tag = soup.a
print(a_tag)          # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print(a_tag.parents)  # a generator object: <generator object parents at 0x000001DEAED01A40>
for x in a_tag.parents:
    print(x)
    print('……………………')
Result
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
……………………
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
……………………
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>
……………………
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>
……………………
5 Traversing the tree: sibling nodes
html = '<a><b>bbb</b><c>ccc</c><a>'
# Get a BeautifulSoup object
soup = BeautifulSoup(html,features='lxml')
print(soup.prettify())
Result
<html>
<body>
<a>
<b>
bbb
</b>
<c>
ccc
</c>
</a>
<a>
</a>
</body>
</html>
next_sibling: the next sibling node
html = '<a><b>bbb</b><c>ccc</c><a>'
# Get a BeautifulSoup object
soup = BeautifulSoup(html,features='lxml')
# print(soup.prettify())
b_tag = soup.b
print(b_tag)  # <b>bbb</b>
# We want b's sibling tag c
print(b_tag.next_sibling)
Result
<b>bbb</b>
<c>ccc</c>
# Check whether the c tag has a following sibling
c_tag = soup.c
print(c_tag.next_sibling)
Result
None
previous_sibling: the previous sibling node
# Check whether the c tag has a preceding sibling
c_tag = soup.c
print(c_tag.previous_sibling)
Result
<b>bbb</b>
next_siblings: all following sibling nodes
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc,features='lxml')
a_tag = soup.a
print(a_tag.next_siblings)
for x in a_tag.next_siblings:
    print(x)
    print('=========')
Result
<generator object next_siblings at 0x000002B545321A40>
,
=========
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
=========
and
=========
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
=========
;
and they lived at the bottom of a well.
=========
previous_siblings: all preceding sibling nodes
# (this example uses the soup built from '<a><b>bbb</b><c>ccc</c><a>' above)
c_tag = soup.c
print(c_tag.previous_siblings)  # <generator object previous_siblings at 0x0000026435361A40>
for v in c_tag.previous_siblings:
    print(v)
    print('------------------')
Result
<generator object previous_siblings at 0x000001D9C7FE1A40>
<b>bbb</b>
------------------
6 Searching the tree
String filter
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc,features='lxml')
a_tag = soup.find('a')  # the 'a' here is a string filter
# a_tag2 = soup.a       # not exactly the same thing as the line above
print(a_tag)
Result
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
a_tags = soup.find_all('a')
print(a_tags)
Result
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
Regular-expression filter
Compile a regular expression with re.compile() and pass it to find() or find_all();
this searches the tree with a regular-expression filter.
Honestly, it is not used all that often.
from bs4 import BeautifulSoup
import re
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc,features='lxml')
# Regular-expression filter
# Find all tags whose names contain 't' (html and title); use re.compile('^t') to match only names that start with 't'
print(soup.find_all(re.compile('t')))
Result
[<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>, <title>The Dormouse's story</title>]
List filter
# Find both the p tags and the a tags
print(soup.find_all(['p','a']))
Result
[<p class="title"><b>The Dormouse's story</b></p>, <p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, <p class="story">...</p>]
print(soup.find_all(['title','b'])) # [<title>The Dormouse's story</title>, <b>The Dormouse's story</b>]
True filter: True matches every tag (usually not very useful)
# True matches all tags
print(soup.find_all(True))
Result
[<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>, <head><title>The Dormouse's story</title></head>, <title>The Dormouse's story</title>, <body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>, <p class="title"><b>The Dormouse's story</b></p>, <b>The Dormouse's story</b>, <p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, <p class="story">...</p>]
Function filter
# Function filter: pass a function that takes a tag and returns True/False
def fn(tag):
    return tag.has_attr('id')
print(soup.find_all(fn))  # pass the function object itself; fn() with parentheses would call it instead of passing it
Result
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
7 find_all() and find()
7.1 find_all()
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc,'lxml')
# def find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs):
# name: a tag name
# attrs: tag attributes (a plain string is matched against the class attribute; a dict can match several attributes)
# recursive: whether to search recursively through all descendant nodes
# text: text content to match
# limit: maximum number of results to return (a negative limit is pointless; it just returns the first match)
# **kwargs: extra keyword arguments, passed by name
# a_tags = soup.find_all('a')
p_tags = soup.find_all('p','title')  # the second positional argument here filters on the class attribute
print(p_tags)
Result
[<p class="title"><b>The Dormouse's story</b></p>]
print(soup.find_all(id = 'link1'))
Result
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
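The attrs and text parameters from the signature above are not demonstrated elsewhere in these notes; a minimal sketch of both against the same html_doc:
# attrs as a dict: match tags whose class attribute is "sister"
print(soup.find_all(attrs={'class': 'sister'}))  # the three a tags
# text: match strings whose content is exactly 'Elsie'
print(soup.find_all(text='Elsie'))  # ['Elsie']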
# limit
print(soup.find_all('a',limit=1)) # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
# limit
print(soup.find_all('a',limit=2))
Result
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
# limit
print(soup.find_all('a',limit=1))
print(soup.a)
print(soup.find('a'))
Result
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# recursive
print(soup.find_all('a',recursive=True)) # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.find_all('a',recursive=False)) # [] : with recursive=False only direct children of the document are searched, and the a tags are nested deeper
print(soup.find_all('a',limit=1)[0]) # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print(soup.find('a')) # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
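To see the effect of recursive on a specific tag rather than on the whole soup, a minimal sketch (assuming the same html_doc):
body = soup.body
print(body.find_all('p', recursive=False))  # the p tags are direct children of body, so they are found
print(body.find_all('a', recursive=False))  # [] : the a tags sit inside a p tag, not directly under body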
find_all() returns all matching tags as a list.
find() returns only the first match.
(The find_all() parameters are the ones listed in the comments above.)
7.2 find_parents(), find_parent(), find_next_siblings(), find_next_sibling()
find_parents() searches all ancestors and returns a list
s = soup.find(text='Elsie')
print(s.find_parents('p'))
Result
[<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>]
find_previous() gives a similar result here, searching backwards through the document:
s = soup.find(text='Elsie')
print(s.find_previous('p'))
Result
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
find_parent() searches for a single (the nearest matching) ancestor
title_tag = soup.title
print(title_tag.find_parent('head')) # <head><title>The Dormouse's story</title></head>
find_next_siblings() searches all following siblings
a_tag = soup.a
# print(a_tag) # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# print(a_tag.find_next_sibling('a')) # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
print(a_tag.find_next_siblings('a'))
Result
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
find_next_sibling() searches for a single following sibling
a_tag = soup.a
print(a_tag) # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print(a_tag.find_next_sibling('a')) # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
7.3 find_previous_siblings(), find_previous_sibling(), find_all_next(), find_next()
find_previous_siblings() searches all preceding siblings
find_previous_sibling() searches for a single preceding sibling
a_tag = soup.find(id = 'link3')
print(a_tag) # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
print(a_tag.find_previous_sibling()) # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
print(a_tag.find_previous_siblings()) # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
find_all_next() searches all elements that come after this one in the document
p_tag = soup.p
print(p_tag.find_all_next())
Result
[<b>The Dormouse's story</b>, <p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, <p class="story">...</p>]
find_next() searches for a single element that comes after this one
p_tag = soup.p
# print(p_tag.find_all_next())
print(p_tag.find_next('a'))
Result
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
8 Modifying the document tree
Modifying a tag's name and attributes
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc,'lxml')
# Modify a tag's name and attributes
tag_p = soup.p
print(tag_p)  # <p class="title"><b>The Dormouse's story</b></p>
tag_p.name = 'w'            # change the tag name
tag_p['class'] = 'content'  # change the attribute
print(tag_p)  # <w class="content"><b>The Dormouse's story</b></w>
Modifying .string: assigning to the string property replaces the tag's original content with the new content
# Assigning to .string replaces the original content
tag_p = soup.p
print(tag_p) # <p class="title"><b>The Dormouse's story</b></p>
print(tag_p.string) # The Dormouse's story
tag_p.string = 'I love you'
print(tag_p) # <p class="title">I love you</p>
print(tag_p.string) # I love you
append() adds content to a tag, just like Python's list .append() method
# append() adds content to a tag, like the list .append() method
tag_p = soup.p
print(tag_p) # <p class="title"><b>The Dormouse's story</b></p>
tag_p.append('hhhhh')
print(tag_p) # <p class="title"><b>The Dormouse's story</b>hhhhh</p>
decompose() removes a tag or section; unnecessary parts of the document can be deleted with it
- Why is class_ written with a trailing underscore? Because class is a Python keyword, bs4 uses class_ as the keyword argument instead. See:
https://blog.csdn.net/qq_23077579/article/details/101323821
# decompose() removes a tag; unnecessary sections can be deleted this way
result = soup.find(class_ = 'title')
result.decompose()
print(soup)
Result
The following line has been removed from the document:
<p class="title"><b>The Dormouse's story</b></p>