文章目录
简介
Beautiful Soup 是 Python 的 HTML/XML 解析器,可以很好地处理不规范标记并生成剖析树(parse tree)。
Beautiful Soup 提供简单实用的导航,搜索以及修改剖析树的操作,大大节省编程时间。
安装
pip install lxml beautifulsoup4
初试
测试页面
<html>
<head><title>Page title</title></head>
<body>
<p id="firstpara" align="center">This is paragraph <b>one</b>.</p>
<p id="secondpara" align="blah">This is paragraph <b>two</b>.</p>
</body>
</html>
长这样
代码
from bs4 import BeautifulSoup

markup = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(markup, 'lxml')  # parse with the lxml parser

root = soup.contents[0]               # first node of the document
print(root.name)                      # its tag name
# 'html'
print(root.contents[0].name)          # tag name of its first child
# 'head'
head = soup.contents[0].contents[0]
print(head.parent.name)               # parent node
# 'html'
print(head.next)                      # next node in parse order
# <title>Page title</title>
print(head.nextSibling.name)          # tag name of the next sibling
# 'body'
print(head.nextSibling.contents[0])
# <p id="firstpara" align="center">This is paragraph <b>one</b>.</p>
print(head.nextSibling.contents[0].nextSibling)
# <p id="secondpara" align="blah">This is paragraph <b>two</b>.</p>
搜索标签和属性
常用操作:
. :取节点(属性访问,如 soup.html.head.title)
.string :取内容
BeautifulSoup对象('xxx') :查找标签
import re
from bs4 import BeautifulSoup

markup = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(markup, 'lxml')  # parse with the lxml parser

title_tag = soup.html.head.title      # walk down to a node via attribute access
print(title_tag)
# <title>Page title</title>
print(title_tag.string)               # text content of the node
# 'Page title'
print(soup('p'))                      # find tags by name
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
print(soup.findAll('p', align="center"))  # filter by attribute; equivalent to soup('p', align="center")
# [<p id="firstpara" align="center">This is paragraph <b>one</b>. </p>]
print(soup('p', align="center"))      # same result as the line above
print(soup.find('p', align="center")) # first match only
# <p id="firstpara" align="center">This is paragraph <b>one</b>. </p>
print(soup('p', align="center")[0]['id'])  # read the id attribute of the match
# 'firstpara'
starts_with_b = re.compile('^b.*')
print(soup.find('p', align=starts_with_b)['id'])  # align value starting with 'b'
# 'secondpara'
print(soup.find('p').b.string)        # first p -> its b -> text
# 'one'
print(soup('p')[1].b.string)          # all p -> second one -> its b -> text
# 'two'
导航
属性 | 含义 |
---|---|
parent | 父节点 |
contents | 子节点 |
string | 字符串内容 |
nextSibling | 下一个兄弟节点 |
previousSibling | 上一个兄弟节点 |
next | 按剖析顺序的下一个节点 |
previous | 按剖析顺序的上一个节点 |
from bs4 import BeautifulSoup

markup = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(markup, 'lxml')  # parse with the lxml parser

print(soup.head.parent)               # parent node
# <html><head><title>Page title</title></head><body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body></html>
print(soup.head.contents)             # children of head
print(soup.p.contents)                # children of the first p
# [<title>Page title</title>]
# ['This is paragraph ', <b>one</b>, '.']
print(soup.b.string)                  # text content
# one
print(soup.head.nextSibling)          # next sibling
# <body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>
print(soup.body.previousSibling)      # previous sibling
# <head><title>Page title</title></head>
print(soup.head.next)                 # next node in parse order
print(soup.head.next.next)            # two steps forward
print(soup.head.next.next.next)       # three steps forward
# <title>Page title</title>
# Page title
# <body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>
print(soup.head.previous)             # previous node in parse order
# <html><head><title>Page title</title></head><body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body></html>
搜索
方法 | 含义 |
---|---|
def findAll(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs) | 所有匹配元素 |
def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs) | 第一个匹配元素 |
def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs) | 后面所有匹配兄弟节点 |
def findNextSibling(self, name=None, attrs={}, text=None, **kwargs) | 后面第一个匹配兄弟节点 |
def findPreviousSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs) | 前面所有匹配兄弟节点 |
def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs) | 前面第一个匹配兄弟节点 |
def findAllNext(self, name=None, attrs={}, text=None, limit=None, **kwargs) | 下层所有匹配元素 |
def findNext(self, name=None, attrs={}, text=None, **kwargs) | 下层第一个匹配元素 |
def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, **kwargs) | 上层所有匹配元素 |
def findPrevious(self, name=None, attrs={}, text=None, **kwargs) | 上层第一个匹配元素 |
def findParents(self, name=None, attrs={}, limit=None, **kwargs) | 所有匹配父节点 |
def findParent(self, name=None, attrs={}, **kwargs) | 第一个匹配父节点 |
1. 所有匹配
def findAll(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
import re
from bs4 import BeautifulSoup

markup = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(markup, 'lxml')  # parse with the lxml parser

print(soup.findAll('b'))  # every matching element
# [<b>one</b>, <b>two</b>]
print(soup.findAll(re.compile('^b')))  # tag names starting with b
# [<body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>, <b>one</b>, <b>two</b>]
print(soup.findAll(['title', 'p']))  # title tags and p tags
# [<title>Page title</title>, <p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
print(soup.findAll({'title': True, 'p': True}))  # same as above, faster
# [<title>Page title</title>, <p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
print(soup.findAll(lambda t: len(t.attrs) == 2))  # any callable returning a bool works
print(soup.findAll(lambda t: len(t.name) == 1 and not t.attrs))  # one-letter tag name and no attributes
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
# [<b>one</b>, <b>two</b>]
para_id = re.compile("para$")
print(soup.findAll(align="center"))  # filter on an attribute value
print(soup.findAll(id=para_id))  # strings, regexes, lists and dicts all work
print(soup.findAll(align=["center", "blah"]))
print(soup.findAll(align=lambda v: v and len(v) < 5))
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
# [<p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
print(soup.findAll(align=True))  # elements that have an align attribute
print(soup.findAll(align=None))  # elements without an align attribute
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
# [<html><head><title>Page title</title></head><body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body></html>, <head><title>Page title</title></head>, <title>Page title</title>, <body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>, <b>one</b>, <b>two</b>]
print(soup.findAll(id=para_id))  # when an attribute name clashes with a keyword, use attrs with a dict
print(soup.findAll(attrs={'id': para_id}))
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
print(soup.findAll(text="one"))  # match on text content
print(soup.findAll(text=["one", "two"]))  # strings, regexes, lists and dicts all work here too
print(soup.findAll(text=re.compile("paragraph")))
print(soup.findAll(text=True))
print(soup.findAll(text=lambda s: len(s) < 12))
# ['one']
# ['one', 'two']
# ['This is paragraph ', 'This is paragraph ']
# ['Page title', 'This is paragraph ', 'one', '.', 'This is paragraph ', 'two', '.']
# ['Page title', 'one', '.', 'two', '.']
print([t.name for t in soup.html.findAll()])  # recursive by default
print([t.name for t in soup.html.findAll(recursive=False)])  # direct children only
# ['head', 'title', 'body', 'p', 'b', 'p', 'b']
# ['head', 'body']
print(soup.findAll('p', limit=1))  # cap the number of matches
print(soup.findAll('p', limit=100))
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>]
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>]
2. 第一个匹配
def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs)
from bs4 import BeautifulSoup

markup = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(markup, 'lxml')  # parse with the lxml parser
print(soup.find('p'))  # first matching element only
# <p align="center" id="firstpara">This is paragraph <b>one</b>.</p>
3. 兄弟节点
所有兄弟节点
def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
一个兄弟节点
def findNextSibling(self, name=None, attrs={}, text=None, **kwargs)
from bs4 import BeautifulSoup

markup = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(markup, 'lxml')  # parse with the lxml parser

first_text = soup.find(text='This is paragraph ')
print(first_text.findNextSiblings('b'))  # all matching siblings after this node
print(first_text.findNextSibling(text=lambda s: len(s) == 1))  # first matching sibling after this node
dot = soup.find(text='.')
print(dot.findPreviousSiblings('b'))  # all matching siblings before this node
print(dot.findPreviousSibling(text=True))  # first matching sibling before this node
# [<b>one</b>]
# .
# [<b>one</b>]
# This is paragraph
4. 上下层
下层所有匹配元素
def findAllNext(self, name=None, attrs={}, text=None, limit=None, **kwargs)
下层第一个匹配元素
def findNext(self, name=None, attrs={}, text=None, **kwargs)
from bs4 import BeautifulSoup

markup = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(markup, 'lxml')  # parse with the lxml parser

first_p = soup.find('p')
print(first_p.findAllNext(text=True))  # every text node after this element
print(first_p.findNext('p'))  # next p after the first p
print(first_p.findNext('b'))  # next b after the first p
# ['This is paragraph ', 'one', '.', 'This is paragraph ', 'two', '.']
# <p align="blah" id="secondpara">This is paragraph <b>two</b>.</p>
# <b>one</b>
last_p = soup('p')[-1]
print(last_p.findAllPrevious(text=True))  # every text node before this element
print(last_p.findPrevious('p'))
print(last_p.findPrevious('b'))
# ['.', 'one', 'This is paragraph ', 'Page title']
# <p align="center" id="firstpara">This is paragraph <b>one</b>.</p>
# <b>one</b>
5. 父节点
所有匹配父节点
def findParents(self, name=None, attrs={}, limit=None, **kwargs)
第一个匹配父节点
def findParent(self, name=None, attrs={}, **kwargs)
from bs4 import BeautifulSoup

markup = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(markup, 'lxml')  # parse with the lxml parser
bold = soup.find('b')
print(bold.findParents())  # every enclosing parent, innermost first
print(bold.findParent('body'))  # nearest matching parent
# [<p align="center" id="firstpara">This is paragraph <b>one</b>.</p>, <body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>, <html><head><title>Page title</title></head><body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body></html>, <html><head><title>Page title</title></head><body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body></html>]
# <body><p align="center" id="firstpara">This is paragraph <b>one</b>.</p><p align="blah" id="secondpara">This is paragraph <b>two</b>.</p></body>
查找class
传参 class_
from bs4 import BeautifulSoup

markup = '''<html><head><title>Page title</title></head><body><p class="firstpara" align="center">This is paragraph <b>one</b>.</p><p class="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
soup = BeautifulSoup(markup, 'lxml')  # parse with the lxml parser
print(soup(class_='firstpara'))  # 'class' is a Python keyword, so the argument is spelled class_
# [<p align="center" class="firstpara">This is paragraph <b>one</b>.</p>]
美化
prettify()
from bs4 import BeautifulSoup

markup = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is paragraph <b>one</b>.</p><p id="secondpara" align="blah">This is paragraph <b>two</b>.</p></body></html>'''
# prettify() re-serializes the tree with one tag per line and indentation
print(BeautifulSoup(markup, 'lxml').prettify())
解析表格
使用 PrettyTable
库
pip install prettytable
from bs4 import BeautifulSoup
from itertools import zip_longest
from prettytable import PrettyTable

markup = '''<html><body>
<table border="1">
<tr>
<th>学号</th>
<th>姓名</th>
</tr>
<tr>
<td>1</td>
<td>张三</td>
</tr>
<tr>
<td>2</td>
<td>李四</td>
</tr>
<tr>
<td>3</td>
<td>王五</td>
</tr>
</table>
</body>
</html>
'''
soup = BeautifulSoup(markup, 'lxml')
headers = [cell.string for cell in soup('th')]  # header cells
cells = [cell.string for cell in soup('td')]    # data cells, in document order
# chunk the flat cell list into rows of len(headers) items each
rows = list(zip_longest(*([iter(cells)] * len(headers))))
print(headers)
print(rows)
table = PrettyTable()
table.field_names = headers  # header row
for row in rows:
    table.add_row(row)       # one data row at a time
print(table)
# ['学号', '姓名']
# [('1', '张三'), ('2', '李四'), ('3', '王五')]
# +------+------+
# | 学号 | 姓名 |
# +------+------+
# | 1    | 张三 |
# | 2    | 李四 |
# | 3    | 王五 |
# +------+------+
推荐阅读:Python表格美化库PrettyTable中文文档
修改树
推荐阅读:修改剖析树
删除特定class或id
from bs4 import BeautifulSoup

markup = '''<html>
<body>
<div id="first" class="d"><p>1</p></div>
<div id="second" class="d"><p>2</p></div>
</body>
</html>'''
soup = BeautifulSoup(markup, 'lxml')
# remove every div carrying class "d" from the tree entirely
for node in soup('div', {'class': 'd'}):
    node.decompose()
print(soup.prettify())
# <html>
# <body>
# </body>
# </html>
报错 bs4.FeatureNotFound: Couldn’t find a tree builder with the features you requested
pip install wheel
pip install -U lxml