1.快速开始
# Sample document used by the quick-start examples below.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
# Name the parser explicitly. Without it bs4 guesses one from whatever is
# installed and emits a warning; 'lxml' matches the later examples in this file.
soup = BeautifulSoup(html_doc, 'lxml')
# print(soup.prettify())  # pretty-print the parsed document
print(soup.title)  # the <title> tag itself
print(soup.title.name)  # the tag's name
print(soup.title.parent.name)  # name of the tag that encloses <title>
print(soup.p['class'])  # class attribute of the first <p>, returned as a list
# Output:
# <title>The Dormouse's story</title>
# title
# head
# ['title']
# Print the href attribute of every <a> tag in the document.
for link in soup.find_all('a'):
    print(link.get('href'))
# Output:
# http://example.com/elsie
# http://example.com/lacie
# http://example.com/tillie

# Dotted access (soup.p, soup.a) returns only the first tag of that name.
# get_text() returns the tag's text as a str; get_text("\n", strip=True)
# joins the pieces with "\n" and strips surrounding whitespace.
print(soup.p.get_text())
# .string is the tag's text content; .stripped_strings is the alternative
# that yields each string with leading/trailing whitespace removed.
print(soup.p.string)
print(soup.p.name)  # tag name
print(soup.p.attrs)  # dict of the tag's attributes
print(soup.a)  # first <a> only, even though the document has several
# Output:
# The Dormouse's story
# The Dormouse's story
# p
# {'class': ['title']}
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# Special case: a tag that contains several child nodes.
html = """<p id='list-1'>
哈哈哈哈
<a class='sss'>
<span>
<h1>aaaa</h1>
</span></a>
<b>bbbbb</b>
</p>"""
# Parser named explicitly so bs4 does not have to guess (and warn).
soup1 = BeautifulSoup(html, 'lxml')
# .string is None when the tag has more than one child node.
print(soup1.p.string)
# get_text() gathers every piece of text nested under the tag; here the
# pieces are joined with ',' and stripped of surrounding whitespace.
print(soup1.p.get_text(',', strip=True))
# Output:
# None
# 哈哈哈哈,aaaa,bbbbb

# Tag names can be chained to drill down through the tree.
print(soup.head.title.string)
print(soup.body.a.string)
# Output:
# The Dormouse's story
# Elsie
2.获取子节点和子孙节点
# Same document as the one-line string below, laid out for readability:
# <html><head><title>The Dormouse's story</title></head>
# <body>
# <p class="title">
# <b>The Dormouse's story</b>
# Once upon a time there were three little sisters; and their names were
# <a href="http://example.com/elsie" class="sister" id="link1">
# <span>Elsie</span>
# </a>,
# <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
# and they lived at the bottom of a well.
# </p>
# <p class="story">...</p>
html_doc = """<html><head><title>The Dormouse's story</title></head><body><p class="title"><b>The Dormouse's story</b>Once upon a time there were three little sisters; and their names were<a href="http://example.com/elsie" class="sister" id="link1"><span>Elsie</span></a>,<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.</p><p class="story">...</p>"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')

# .contents: list holding the direct children of the first <p>.
print("获取p下所有子节点:", soup.p.contents, end='\n\n')
# .children: iterator over those same direct children.
print("得到一个迭代器,包含p下面所有子节点:", soup.p.children, end='\n\n')
print("迭代p标签下面的所有子节点:")
for i, child in enumerate(soup.p.children):
    print(i, child)
# Separator between the two listings; runs once, after the loop.
print('\n')

# .descendants: generator over children, grandchildren, and so on, so the
# text inside nested tags shows up as separate entries.
print("得到一个迭代器,包含p下面所有子节点以及子孙节点:", soup.p.descendants, end='\n\n')
print("迭代p标签下面的所有子节点和子孙节点:")
for i, child in enumerate(soup.p.descendants):
    print(i, child)
输出结果:
获取p下所有子节点: [<b>The Dormouse's story</b>, 'Once upon a time there were three little sisters; and their names were', <a class="sister" href="http://example.com/elsie" id="link1"><span>Elsie</span></a>, ',', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, ' and', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, ';and they lived at the bottom of a well.']
得到一个迭代器,包含p下面所有子节点: <list_iterator object at 0x000001DF241BA400>
迭代p标签下面的所有子节点:
0 <b>The Dormouse's story</b>
1 Once upon a time there were three little sisters; and their names were
2 <a class="sister" href="http://example.com/elsie" id="link1"><span>Elsie</span></a>
3 ,
4 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
5 and
6 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
7 ;and they lived at the bottom of a well.
得到一个迭代器,包含p下面所有子节点以及子孙节点: <generator object descendants at 0x000001DF2410C570>
迭代p标签下面的所有子节点和子孙节点:
0 <b>The Dormouse's story</b>
1 The Dormouse's story
2 Once upon a time there were three little sisters; and their names were
3 <a class="sister" href="http://example.com/elsie" id="link1"><span>Elsie</span></a>
4 <span>Elsie</span>
5 Elsie
6 ,
7 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
8 Lacie
9 and
10 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
11 Tillie
12 ;and they lived at the bottom of a well.
3.获取父节点和祖先节点
# .parent is the tag that directly encloses the first <a> (the <p> here);
# printing it shows that parent together with everything it contains.
print('找到a标签的父节点下面的所有节点(同一个父亲的节点):', soup.a.parent, '\n')
# .parents is a generator that walks upward through every ancestor in turn.
print('得到一个迭代器,包含a标签的祖先点下面的所有节点(同一个祖先的节点):', soup.a.parents, '\n')
for i, parent in enumerate(soup.a.parents):
    print(i, parent)
输出结果:
找到a标签的父节点下面的所有节点(同一个父亲的节点): <p class="title"><b>The Dormouse's story</b>Once upon a time there were three little sisters; and their names were<a class="sister" href="http://example.com/elsie" id="link1"><span>Elsie</span></a>,<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;and they lived at the bottom of a well.</p>
得到一个迭代器,包含a标签的祖先点下面的所有节点(同一个祖先的节点): <generator object parents at 0x000001DF23F59AF0>
0 <p class="title"><b>The Dormouse's story</b>Once upon a time there were three little sisters; and their names were<a class="sister" href="http://example.com/elsie" id="link1"><span>Elsie</span></a>,<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;and they lived at the bottom of a well.</p>
1 <body><p class="title"><b>The Dormouse's story</b>Once upon a time there were three little sisters; and their names were<a class="sister" href="http://example.com/elsie" id="link1"><span>Elsie</span></a>,<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;and they lived at the bottom of a well.</p><p class="story">...</p></body>
2 <html><head><title>The Dormouse's story</title></head><body><p class="title"><b>The Dormouse's story</b>Once upon a time there were three little sisters; and their names were<a class="sister" href="http://example.com/elsie" id="link1"><span>Elsie</span></a>,<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;and they lived at the bottom of a well.</p><p class="story">...</p></body></html>
3 <html><head><title>The Dormouse's story</title></head><body><p class="title"><b>The Dormouse's story</b>Once upon a time there were three little sisters; and their names were<a class="sister" href="http://example.com/elsie" id="link1"><span>Elsie</span></a>,<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;and they lived at the bottom of a well.</p><p class="story">...</p></body></html>
4.获取兄弟节点
print("得到兄弟节点的迭代器")
# .next_siblings: generator over the nodes that follow the first <a>
# under the same parent.
print(soup.a.next_siblings)
for i, brother in enumerate(soup.a.next_siblings):
    print(i, brother)

print("得到与a标签同一个父节点的前面的所有标签")
# .previous_siblings: generator over the nodes that precede the first <a>
# under the same parent, nearest one first.
print(soup.a.previous_siblings)
for i, previous in enumerate(soup.a.previous_siblings):
    print(i, previous)
输出结果:
得到兄弟节点的迭代器
<generator object next_siblings at 0x000001DF2408EBA0>
0 ,
1 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
2 and
3 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
4 ;and they lived at the bottom of a well.
得到与a标签同一个父节点的前面的所有标签
<generator object previous_siblings at 0x000001DF2408EB48>
0 Once upon a time there were three little sisters; and their names were
1 <b>The Dormouse's story</b>
5.find_all()方法和find()方法
根据标签来查询
# find_all() and find() accept the same arguments and can search by tag
# name, attributes, or content; find() returns only the first match.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title">
<b>The Dormouse's story</b>
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')

# find_all() returns a list whose items are bs4.element.Tag objects.
print(soup.find_all('a'))
# A tag name can be combined with keyword attribute filters ...
print(soup.find_all('a', id="link3"))
# ... and with an attrs dict at the same time.
print(soup.find_all('a', id="link3", attrs={'class': "sister"}))
# Searches can be nested: find() is called on a Tag returned by find_all().
# .contents lists a tag's children and is available on Tag objects only.
print(soup.find_all('a')[0].find('span').contents)
输出结果:
[<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
[<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
[<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
['Elsie']
根据属性来查询
# Search on attributes alone, without naming a tag.
id_filter = {'id': 'link1'}
print(soup.find_all(attrs=id_filter))
class_filter = {'class': 'sister'}
print(soup.find_all(attrs=class_filter))
# Keyword form of the class filter; it is spelled class_ because `class`
# is a reserved word in Python.
print(soup.find_all(class_='sister'))
输出结果:
[<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>]
[<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
[<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
6.使用select选择器
# Fixture for the select() examples: select() picks elements with CSS selectors.
# NOTE(review): this markup nests <div>/<ul> inside <p>, which is not valid
# HTML, so the parser may close the <p> early — confirm if the examples that
# follow are changed to depend on where the <p> ends.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title">
<b>The Dormouse's story</b>
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
<div class='panel-1'>
<ul class='list' id='list-1'>
<li class='element'>Foo</li>
<li class='element'>Bar</li>
<li class='element'>Jay</li>
</ul>
<ul class='list list-small' id='list-2'>
<li class='element'><h1 class='yyyy'>Foo</h1></li>
<li class='element xxx'>Bar</li>
<li class='element'>Jay</li>
</ul>
</div>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup=BeautifulSoup(html_doc,'lxml')
# select() takes a CSS selector and returns a list of the matching tags.
first_paragraph = soup.p
# Class selector, searched only inside the first <p>.
print(first_paragraph.select('.sister'))
# Descendant selector: <span> tags nested inside .sister elements. Every
# match is returned; this document has just one such <span>.
print(soup.select('.sister span'))
# Output:
# [<a class="sister" href="http://example.com/elsie" id="link1">
# <span>Elsie</span>
# </a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
# [<span>Elsie</span>]

# "#" selects by id.
link1_matches = soup.select('#link1')
print(link1_matches)
# Space-separated parts select nested (descendant) elements.
print(soup.select('#link1 span'))
# Output:
# [<a class="sister" href="http://example.com/elsie" id="link1">
# <span>Elsie</span>
# </a>]
# [<span>Elsie</span>]

second_list = soup.select('#list-2')
print(second_list)
# ".element.xxx" (no space) requires both classes on the same tag.
print(soup.select('#list-2 .element.xxx'))
# select() can be called again on a tag taken from an earlier result.
print(second_list[0].select('.element'))
# Output:
# [<ul class="list list-small" id="list-2">
# <li class="element"><h1 class="yyyy">Foo</h1></li>
# <li class="element xxx">Bar</li>
# <li class="element">Jay</li>
# </ul>]
# [<li class="element xxx">Bar</li>]
# [<li class="element"><h1 class="yyyy">Foo</h1></li>, <li class="element xxx">Bar</li>, <li class="element">Jay</li>]

# Tags returned by select() expose attributes and text as usual.
heading = soup.select('#list-2 h1')[0]
print(heading.attrs)
print(heading.get_text())
# Output:
# {'class': ['yyyy']}
# Foo