Basic Usage of the BeautifulSoup Library
1. Library Basics
1.1 Installing the library
pip install beautifulsoup4==4.9.1
pip install lxml  # the lxml parser is more powerful and faster
1.2 Importing the library
from bs4 import BeautifulSoup
1.3 Parsing different inputs
- From an HTML string:
soup = BeautifulSoup(html, 'lxml')
print(soup)             # print the HTML
print(soup.prettify())  # print the prettified HTML (indented formatting)
- From an HTML file:
soup = BeautifulSoup(open("index.html"), 'lxml')
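A minimal self-contained sketch of both cases; the HTML string and the index.html file name are placeholders for illustration:

from bs4 import BeautifulSoup

# Parse an HTML string (placeholder markup)
html = "<html><body><p>hello</p></body></html>"
soup = BeautifulSoup(html, 'lxml')
print(soup.prettify())

# Parse an HTML file (assumes an index.html exists in the working directory)
with open("index.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f, 'lxml')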
2. Getting Structured Data from the HTML
- Get the title tag:
soup.title
- Get the head tag:
soup.head
- Get a tag's name:
soup.title.name
- Get the text inside the title tag:
soup.title.string
- Get the parent tag of title:
soup.title.parent
- Get the name of title's parent tag:
soup.title.parent.name
- Get a tag's attributes:
soup.div.attrs
- Get all text content:
print(soup.get_text())
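A runnable sketch that exercises the accessors above; the markup is invented for illustration:

from bs4 import BeautifulSoup

# Small made-up document just to exercise the accessors above
html = """<html><head><title>demo title</title></head>
<body><div id="main" class="box"><p>hello</p></div></body></html>"""
soup = BeautifulSoup(html, 'lxml')

print(soup.title)              # <title>demo title</title>
print(soup.title.name)         # title
print(soup.title.string)       # demo title
print(soup.title.parent.name)  # head
print(soup.div.attrs)          # {'id': 'main', 'class': ['box']}
print(soup.get_text())         # all text in the document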
3. Node Operations
3.1 Getting child nodes
from bs4 import BeautifulSoup
html_doc = """<html><div><p>p content</p></div><html>"""
soup = BeautifulSoup(html_doc, 'lxml')
- Get the div node:
div_tag = soup.div
print(div_tag)  # <div><p>p content</p></div>
- Get the contents of the div node:
print(div_tag.contents)  # [<p>p content</p>] (a list)
title_tag = div_tag.contents[0]
print(title_tag)  # <p>p content</p>
- Get the content of the p tag inside the div node:
title_tag = div_tag.contents[0]
print(title_tag)           # <p>p content</p>
print(title_tag.contents)  # ['p content']
- Iterate over all direct child nodes:
for child in soup.p.children:
    print(child)
- Iterate over all descendant nodes (recursive over children, grandchildren, ...):
for child in div_tag.descendants:
    print(child)
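To make the difference between children and descendants concrete, a small sketch with the same markup:

from bs4 import BeautifulSoup

html_doc = """<html><div><p>p content</p></div></html>"""
soup = BeautifulSoup(html_doc, 'lxml')
div_tag = soup.div

# .children yields only the direct children of the div
print(list(div_tag.children))     # [<p>p content</p>]

# .descendants walks the whole subtree, including text nodes
print(list(div_tag.descendants))  # [<p>p content</p>, 'p content']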
3.2 Getting node content
3.2.1 Single content
from bs4 import BeautifulSoup
html_doc = """<html><div><p>p content</p></div><html>"""
soup = BeautifulSoup(html_doc, 'lxml')
string (returns a NavigableString, or None if the tag has more than one child)
- Get the content of the div node:
print(soup.div.string)  # p content
- Get the content of the p node:
print(soup.p.string)  # p content
text (returns the content as a plain string, concatenating all descendant strings)
- Get the content of the div node:
print(soup.div.text)  # p content
- Get the content of the p node:
print(soup.p.text)  # p content
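The practical difference shows up when a tag has more than one child; a short illustrative sketch:

from bs4 import BeautifulSoup

# A div with two children: .string cannot pick one, .text joins them all
html_doc = """<html><div><p>first</p><p>second</p></div></html>"""
soup = BeautifulSoup(html_doc, 'lxml')

print(soup.div.string)  # None (ambiguous: more than one child)
print(soup.div.text)    # firstsecond (all descendant strings concatenated)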
3.2.2 Multiple contents
from bs4 import BeautifulSoup
html_doc = """
<html>
<head>
<title>p data</title>
</head>
<body>
<p class="title"><b>b data</b></p>
<p class="story">
p before
<a href="http://example.com/a1" class="sister" id="link1">a1</a>
<a href="http://example.com/a2" class="sister" id="link2">a2</a>
<a href="http://example.com/a3" class="sister" id="link3">a3</a>
p end
</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')
strings
- Iterate over all text content; whitespace-only strings are included:
print(soup.strings)
for string in soup.strings:
    print(repr(string))
# <generator object Tag._all_strings at 0x00000203DA9B5430>
# '\n'
# '\n'
# 'p data'
# '\n'
# '\n'
# '\n'
# 'b data'
# '\n'
# '\n p before\n '
# 'a1'
# '\n'
# 'a2'
# '\n'
# 'a3'
# '\n p end\n '
# '\n'
# '\n'
# '\n'
stripped_strings
- Iterate over all text content with whitespace stripped; blank strings are skipped:
print(soup.stripped_strings)
for string in soup.stripped_strings:
    print(repr(string))
# <generator object Tag.stripped_strings at 0x0000024DF5675430>
# 'p data'
# 'b data'
# 'p before'
# 'a1'
# 'a2'
# 'a3'
# 'p end'
4. Searching for Matching Nodes
from bs4 import BeautifulSoup
html_doc = """
<html>
<head>
<title>p data</title>
</head>
<body>
<p class="title"><b>b data</b></p>
<p class="story">
p before
<a href="http://example.com/a1" class="sister" id="link1">a1</a>
<a href="http://example.com/a2" class="sister" id="link2">a2</a>
<a href="http://example.com/a3" class="sister" id="link3">a3</a>
p end
</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')
4.1 find_all
find_all( name , attrs , recursive , string , **kwargs )
- Returns all nodes that match the filter conditions
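Of the parameters in this signature, recursive is the only one not demonstrated below; a short sketch of its effect, reusing the soup defined above:

# recursive=False restricts the search to direct children only
print(soup.html.find_all("a"))                   # all three <a> tags (recursive by default)
print(soup.html.find_all("a", recursive=False))  # [] -- the <a> tags are not direct children of <html>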
4.1.1 Search by tag name
- Search by tag name:
print(soup.find_all("a"))  # find all a tags
# [<a class="sister" href="http://example.com/a1" id="link1">a1</a>, <a class="sister" href="http://example.com/a2" id="link2">a2</a>, <a class="sister" href="http://example.com/a3" id="link3">a3</a>]
- Search by id:
print(soup.find_all(id="link1"))
# [<a class="sister" href="http://example.com/a1" id="link1">a1</a>]
- Search against a list of tag names (multiple conditions):
print(soup.find_all(["title", "b"]))
# [<title>p data</title>, <b>b data</b>]
- Search with a regular expression:
import re
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)
# body
# b
4.1.2 Search by tag attributes
- Search by exact id match:
print(soup.find_all(id="link1"))
# [<a class="sister" href="http://example.com/a1" id="link1">a1</a>]
- Match any tag that has an id attribute:
print(soup.find_all(id=True))
# [<a class="sister" href="http://example.com/a1" id="link1">a1</a>, <a class="sister" href="http://example.com/a2" id="link2">a2</a>, <a class="sister" href="http://example.com/a3" id="link3">a3</a>]
- Search by href:
import re
print(soup.find_all(href=re.compile("1")))
# [<a class="sister" href="http://example.com/a1" id="link1">a1</a>]
- Search by text:
import re
print(soup.find_all(text=re.compile("^a")))
# ['a1', 'a2', 'a3']
- Search by class selector:
import re
print(soup.find_all(class_=re.compile("s")))
# [<p class="story">
#  p before
#  <a class="sister" href="http://example.com/a1" id="link1">a1</a>
#  <a class="sister" href="http://example.com/a2" id="link2">a2</a>
#  <a class="sister" href="http://example.com/a3" id="link3">a3</a>
#  p end
#  </p>, <a class="sister" href="http://example.com/a1" id="link1">a1</a>, <a class="sister" href="http://example.com/a2" id="link2">a2</a>, <a class="sister" href="http://example.com/a3" id="link3">a3</a>]
4.1.3 Matching on multiple conditions
import re
from bs4 import BeautifulSoup
html_doc = """
<html>
<body>
<p class="story" id="p1">
<a href="http://example.com/a1" class="c1" id="link1" data_a1="data1">a1</a>
<a href="http://example.com/a2" class="c2" id="link2">a2</a>
</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')
print(soup.find_all(href=re.compile("1"), id='link1'))
print(soup.find_all("a", class_="c1"))
print(soup.find_all(attrs={"data_a1": "data1"}))
print(soup.find_all('p', class_="story", id="p1"))
print(soup.find_all('p', attrs={"class":"story", "id":"p1"}))
# [<a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1</a>]
# [<a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1</a>]
# [<p class="story" id="p1"> <a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1</a><a class="c2" href="http://example.com/a2" id="link2">a2</a></p>]
# [<p class="story" id="p1"><a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1</a><a class="c2" href="http://example.com/a2" id="link2">a2</a></p>]
4.1.4 Matching content with the text parameter
import re
from bs4 import BeautifulSoup
html_doc = """
<html>
<body>
<p class="story" id="p1">
<a href="http://example.com/a1" class="c1" id="link1" data_a1="data1">a1 text</a>
<a href="http://example.com/a2" class="c2" id="link2">a2 content</a>
</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')
- Exact content match:
print(soup.find_all(text="a1 text")) # ['a1 text']
- Match against a list of strings:
print(soup.find_all(text=["a1 text", "a2 content"])) # ['a1 text', 'a2 content']
- Fuzzy match with a regular expression:
print(soup.find_all(text=re.compile("n"))) # ['a2 content']
4.1.5 Limiting the number of results with the limit parameter
- Limit the number of returned results:
from bs4 import BeautifulSoup
html_doc = """
<html>
<body>
<p class="story" id="p1">
<a href="http://example.com/a1" class="c1" id="link1" data_a1="data1">a1 text</a>
<a href="http://example.com/a2" class="c2" id="link2">a2 content</a>
<a href="http://example.com/a2" class="c2" id="link3">a2 content</a>
<a href="http://example.com/a2" class="c2" id="link4">a2 content</a>
</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')
print(soup.find_all("a",limit=2))
print(soup.find_all("a")[0:2])
# [<a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1 text</a>, <a class="c2" href="http://example.com/a2" id="link2">a2 content</a>]
# [<a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1 text</a>, <a class="c2" href="http://example.com/a2" id="link2">a2 content</a>]
4.2 find
find( name , attrs , recursive , string , **kwargs )
- Returns the first node that matches the filter conditions
- If you only want the first match, use find; otherwise use find_all
from bs4 import BeautifulSoup

html_doc = """
<html>
<body>
<p class="story" id="p1">
<a href="http://example.com/a1" class="c1" id="link1" data_a1="data1">a1 text</a>
<a href="http://example.com/a2" class="c2" id="link2">a2 content</a>
</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')
print(soup.find('a'))
print(soup.find("notag"))  # find_all() returns an empty list when nothing matches; find() returns None
print(soup.find("p").find("a"))
# <a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1 text</a>
# None
# <a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1 text</a>
4.3 find_parents() and find_parent()
from bs4 import BeautifulSoup
html_doc = """
<html>
<head>
<title>p data</title>
</head>
<body>
<p class="title"><b>b data</b></p>
<p class="story">
p before
<a href="http://example.com/a1" class="sister" id="link1">a1</a>
<a href="http://example.com/a2" class="sister" id="link2">a2</a>
<a href="http://example.com/a3" class="sister" id="link3">a3</a>
p end
</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')
a_string = soup.find(text="a1")
print(a_string.find_parent())   # the direct parent
print(a_string.find_parents())  # all ancestor nodes
print(a_string.find_parent("p"))
5. Searching with CSS Selectors
5.1 Search by class
print(soup.select(".c1"))
5.2 Search by id
print(soup.select("#link1"))
5.3 Combined selectors
- Descendant combination:
print(soup.select("p #link2"))
- Direct child search (immediate children of the tag only):
print(soup.select("p > #link2"))
- Tag attribute access (this example assumes the matched a tag wraps an img):
print(soup.find('a', class_="c1").img.attrs['src'])
- Tag matching both a class and an id selector:
soup.select(".story#test")
- Tag matching multiple class selectors:
soup.select(".story.c1")
- Tag matching multiple class selectors and an id selector:
soup.select(".story.data1#book")
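The selectors above assume a page with matching classes and ids; a minimal self-contained sketch with invented markup:

from bs4 import BeautifulSoup

# Invented markup carrying the classes/ids that the selectors above expect
html_doc = """
<p class="story" id="p1">
  <a href="http://example.com/a1" class="c1" id="link1"><img src="a1.png"/></a>
  <a href="http://example.com/a2" class="c2" id="link2">a2</a>
</p>
"""
soup = BeautifulSoup(html_doc, 'lxml')

print(soup.select(".c1"))         # search by class
print(soup.select("#link1"))      # search by id
print(soup.select("p #link2"))    # descendant combination
print(soup.select("p > #link2"))  # direct child only
print(soup.find('a', class_="c1").img.attrs['src'])  # a1.png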
5.4 Search by attribute
from bs4 import BeautifulSoup
html_doc = """
<html>
<body>
<p class="story" id="p1">
<a href="http://example.com/a1" class="c1" id="link1" data_a1="data1">a1 text</a>
<a href="http://example.com/a2" class="c2" id="link2">a2 content</a>
</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')
print(soup.select("a[href='http://example.com/a2']"))
# [<a class="c2" href="http://example.com/a2" id="link2">a2 content</a>]
6. Other Operations
6.1 Extracting table rows
trs = soup.select('#TableList table tr')[1:]  # skip the first (header) row
for tr in trs:  # iterate over the table rows
    tds = tr.select('td')
    td_list = []
    for td in tds:
        text = td.text
        td_list.append(text)
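The #TableList selector is specific to the original page; a self-contained sketch with made-up table markup shows the same row extraction:

from bs4 import BeautifulSoup

# Made-up table markup standing in for the page the selector above targets
html_doc = """
<div id="TableList">
  <table>
    <tr><th>name</th><th>price</th></tr>
    <tr><td>apple</td><td>3</td></tr>
    <tr><td>pear</td><td>5</td></tr>
  </table>
</div>
"""
soup = BeautifulSoup(html_doc, 'lxml')

rows = []
for tr in soup.select('#TableList table tr')[1:]:  # skip the header row
    rows.append([td.text for td in tr.select('td')])
print(rows)  # [['apple', '3'], ['pear', '5']]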
6.2 Function usage
6.2.1 get(): getting an element's attribute
div.find('img').get('src')
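get() is the safer accessor: it returns None (or a supplied default) when the attribute is missing, whereas indexing with tag['src'] raises KeyError. A small sketch with invented markup:

from bs4 import BeautifulSoup

# Invented markup: one img with a src attribute, one without
div = BeautifulSoup("<div><img src='pic.png'/><img/></div>", 'lxml').div

print(div.find('img').get('src'))             # pic.png
print(div.find_all('img')[1].get('src'))      # None (missing attribute, no exception)
print(div.find_all('img')[1].get('src', ''))  # '' (explicit default)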