1.contents children descendants
2.string strings stripped_strings
Beautiful Soup 是一个可以从HTML或XML文件中提取数据的网页信息提取库。
(2)安装:pip install lxml
pip install bs4
# Basic Beautiful Soup usage: parse a small document, then read the href
# attribute of every <a> tag.
# Imported here so this snippet runs on its own (no earlier import is visible).
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Equivalent spelling with the parser passed by keyword:
# soup = BeautifulSoup(html_doc, features="lxml")
soup = BeautifulSoup(html_doc, "lxml")

# prettify() renders the parsed document with standard indentation.
# print(soup.prettify())
# print(soup.title)  # <title>The Dormouse's story</title>

# .string returns the text content of a tag.
# print(soup.title.string)  # The Dormouse's story

# .name returns the tag name.
# print(soup.title.name)  # title

# Goal: get every <p> tag.
# print(soup.p)  # only the first <p>
# tag = soup.find_all('p')
# # print(soup.find_all('p'))  # all <p> tags, returned as a list
# print(len(tag))
# for i in tag:
#     print(i)

# Read the href attribute of every <a> tag; three equivalent ways are shown.
tag = soup.find_all('a')
for i in tag:
    # print(i.get('href'))    # 1. pass the attribute name to .get() as a string
    # print(i.attrs)          # 2. .attrs returns all attributes as a dict
    # print(i.attrs['href'])
    print(i['href'])  # 3. index the tag directly by attribute name
from bs4 import BeautifulSoup
# Demonstrates the object types Beautiful Soup produces:
# Tag, NavigableString, BeautifulSoup and Comment.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
<span><!-- comment --></span>
"""
# The <span> holding only an HTML comment is required by the last line below:
# without it soup.span is None and `.string` raises AttributeError.

soup = BeautifulSoup(html_doc, 'lxml')
# print(type(soup.title))         # <class 'bs4.element.Tag'>
# print(type(soup.title.string))  # <class 'bs4.element.NavigableString'>
# print(type(soup))               # <class 'bs4.BeautifulSoup'>
# A tag whose only child is an HTML comment exposes it through .string.
print(type(soup.span.string))  # <class 'bs4.element.Comment'>
三.遍历文档树 遍历子节点
1.contents children descendants
from bs4 import BeautifulSoup
# Walking down the tree: .contents, .children and .descendants.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# .contents    -> a list of all direct child nodes
# .children    -> an iterator over the direct child nodes
# .descendants -> a generator that walks every descendant (children,
#                 grandchildren, and so on)
soup = BeautifulSoup(html_doc, 'lxml')
heads = soup.head
# print(heads.contents)
# print(heads.children)
# for head in heads.children:
#     print(head)

# html = soup.html
# # for i in html.contents:
# #     print(i)
# for i in html.descendants:  # every descendant, at any depth
#     print(i)
2.string strings stripped_strings
string 获取标签里面的内容
strings 返回的是一个生成器对象,用来获取多个标签的内容
stripped_strings 和strings基本一致 但是它可以把多余空格去掉
# title_tag = soup.title
# print(title_tag.string)
# html_tag = soup.html
# print(html_tag.strings)
# for i in html_tag.strings:
# print(i)
# html_tag = soup.html
# print(html_tag.stripped_strings)
# for i in html_tag.stripped_strings:
# print(i)
四.遍历文档树 遍历父节点
# t = soup.title
# print(t.parent)
# html = soup.html
# print(html.parent)
# print(type(html.parent)) #<class 'bs4.BeautifulSoup'>
# print(t.parents)
# for i in t.parents:
# print(i)
# print("*"*50)
五.遍历文档树 遍历兄弟节点
(1)next_sibling 下一个兄弟节点;
(2)previous_sibling 上一个兄弟节点;
(3)next_siblings 下一个所有兄弟节点;
(4)previous_siblings 上一个所有兄弟节点;
next_sibling 下一个兄弟节点
previous_sibling 上一个兄弟节点
next_siblings 下一个所有兄弟节点
previous_siblings 上一个所有兄弟节点
# Sample markup for the sibling-navigation examples below.
html2 = """
<title>The Dormouse's story</title>
<p class="story">p1</p>
<span class="story">span</span>
<p class="story">p2</p>
"""

# soup2 = BeautifulSoup(html2, 'lxml')
# s = soup2.span
# Whitespace and newlines between tags are nodes too, so the previous sibling
# of <span> is a whitespace string and the output looks empty.
# print(s.previous_sibling)
# for i in s.previous_siblings:
#     print(i)
from bs4 import BeautifulSoup
# Searching the tree with find() and find_all(), using string and list filters.
# find()
# find_all()
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# String filter: pass a tag name as a plain string.
soup = BeautifulSoup(html_doc, 'lxml')

# find() returns the first match only.
# result = soup.find('a')
# print(result)

# find_all() returns every match as a list.
# result = soup.find_all('a')
# print(result)

# List filter: match any of the listed tag names.
# result = soup.find_all(['title','b'])
# print(result)
七.find_all() 和 find()
def find_all(self, name=None, attrs={}, recursive=True, text=None,
limit=None, **kwargs):
# find_all() examples on a job-listing table; the live code at the bottom
# prints the job title of every data row.
html = """
<table class="tablelist" cellpadding="0" cellspacing="0">
<tr class="h">
<td class="l" width="374">职位名称</td>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=33824&keywords=python&tid=87&lid=2218">22989-金融云区块链高级研发工程师(深圳)</a></td>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=29938&keywords=python&tid=87&lid=2218">22989-金融云高级后台开发</a></td>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=31236&keywords=python&tid=87&lid=2218">SNG16-腾讯音乐运营开发工程师(深圳)</a></td>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=31235&keywords=python&tid=87&lid=2218">SNG16-腾讯音乐业务运维工程师(深圳)</a></td>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=34531&keywords=python&tid=87&lid=2218">TEG03-高级研发工程师(深圳)</a></td>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=34532&keywords=python&tid=87&lid=2218">TEG03-高级图像算法研发工程师(深圳)</a></td>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=31648&keywords=python&tid=87&lid=2218">TEG11-高级AI开发工程师(深圳)</a></td>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=32218&keywords=python&tid=87&lid=2218">15851-后台开发工程师</a></td>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=32217&keywords=python&tid=87&lid=2218">15851-后台开发工程师</a></td>
<tr class="odd">
<td class="l square"><a id="test" class="test" target='_blank' href="position_detail.php?id=34511&keywords=python&tid=87&lid=2218">SNG11-高级业务运维工程师(深圳)</a></td>
"""

soup2 = BeautifulSoup(html, 'lxml')

# Get every <tr> tag.
# trs = soup2.find_all('tr')
# for t in trs:
#     print(t)
#     print('*'*50)

# Get the third <tr> tag: index the result list (indices start at 0).
# trs = soup2.find_all('tr')
# print(trs[2])

# Locate tags by attribute value: <tr> tags with class="even".
# trs = soup2.find_all('tr', attrs={'class': 'even'})
# for t in trs:
#     print(t)
#     print('*'*50)

# Note: the keyword is class_ (trailing underscore), because `class` is a
# reserved word in Python.
# trs = soup2.find_all('tr', class_="even")
# for t in trs:
#     print(t)
#     print('*'*50)

# <tr> tags with class="even" AND id="test"; limit=N caps the number of
# results. In this markup no <tr> carries id="test" (only the last <a> does),
# so this query returns an empty list.
# trs = soup2.find_all('tr', class_="even", id="test")
# for t in trs:
#     print(t)
#     print('*'*50)

# Read attribute values.
# a_tags = soup2.find_all('a')
# for i in a_tags:
#     print(i['href'])

# Print the job titles. [1:] skips the first row, which is the table header.
trs = soup2.find_all('tr')[1:]
for t in trs:
    # print(t)
    # print('*'*50)
    tds = t.find_all('td')[0]
    # The first <td> wraps a single <a>, so .string yields the link text.
    print(tds.string)
CSS 选择器参考手册: https://www.w3school.com.cn/cssref/css_selectors.asp
from bs4 import BeautifulSoup
# CSS-selector searching. Rough correspondence with the find API:
# select_one() --> find()
# select() --> find_all()
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc,'lxml')

# select_one() is the counterpart of find(): it returns only the first match.
# a_tag = soup.select_one('a')
# print(a_tag)

# select() returns every match.
# a_tag = soup.select('a')
# print(a_tag)

# Get the tags with class="sister".
# CSS class selector: elements with class="intro"  --> .intro
#                     elements with class="sister" --> .sister
# tags = soup.select('.sister')
# print(tags)

# Get the tag with id="link2".
# CSS id selector: element with id="firstname" --> #firstname
#                  element with id="link2"     --> #link2
# tags = soup.select('#link2')
# print(tags)

# # Get the text content.
# b_tag = soup.select('b')[0]
# # print(b_tag.string)
# print(b_tag.get_text())
# select() examples on a job-listing table; the live code at the bottom
# prints the job title of every data row.
html = """
<table class="tablelist" cellpadding="0" cellspacing="0">
<tr class="h">
<td class="l" width="374">职位名称</td>
<tr class="even" id="test">
<td class="l square"><a target="_blank" href="position_detail.php?id=33824&keywords=python&tid=87&lid=2218">22989-金融云区块链高级研发工程师(深圳)</a></td>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=29938&keywords=python&tid=87&lid=2218">22989-金融云高级后台开发</a></td>
<tr class="even" id="test">
<td class="l square"><a target="_blank" href="position_detail.php?id=31236&keywords=python&tid=87&lid=2218">SNG16-腾讯音乐运营开发工程师(深圳)</a></td>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=31235&keywords=python&tid=87&lid=2218">SNG16-腾讯音乐业务运维工程师(深圳)</a></td>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=34531&keywords=python&tid=87&lid=2218">TEG03-高级研发工程师(深圳)</a></td>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=34532&keywords=python&tid=87&lid=2218">TEG03-高级图像算法研发工程师(深圳)</a></td>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=31648&keywords=python&tid=87&lid=2218">TEG11-高级AI开发工程师(深圳)</a></td>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=32218&keywords=python&tid=87&lid=2218">15851-后台开发工程师</a></td>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=32217&keywords=python&tid=87&lid=2218">15851-后台开发工程师</a></td>
<tr class="odd">
<td class="l square"><a id="test" class="test" target='_blank' href="position_detail.php?id=34511&keywords=python&tid=87&lid=2218">SNG11-高级业务运维工程师(深圳)</a></td>
"""

soup2 = BeautifulSoup(html, 'lxml')

# 1. Get every <tr> tag.
# trs = soup2.select('tr')
# print(trs)

# 2. Get the third <tr> tag.
# trs = soup2.select('tr')[2]
# print(trs)

# 3. Get every tag with class="even" (class selector or attribute selector).
# trs = soup2.select('.even')
# trs = soup2.select('tr[class="even"]')
# print(trs)

# 4. Get the href attribute of every <a> tag.
# a_tags = soup2.select('a')
# # print(a_tags)
# for i in a_tags:
#     print(i['href'])

# 5. Print the job titles. [1:] skips the first row, which is the table header.
trs = soup2.select('tr')[1:]
for i in trs:
    # The first <td> wraps a single <a>, so .string yields the link text.
    tds = i.select('td')[0]
    print(tds.string)
    # Alternative: walk the row's child list instead of running a selector.
    # tds = i.contents
    # print(tds[1].string)
(2)修改string:给 .string 属性赋值,就相当于用新的内容替代了原来的内容;
(3)append():向tag中添加内容,就好像Python列表的 .append() 方法;
from bs4 import BeautifulSoup
修改string:给 .string 属性赋值,就相当于用新的内容替代了原来的内容
append():向tag中添加内容,就好像Python列表的 .append() 方法
decompose():删除节点(例如段落),对于一些没有必要的文章段落,我们可以把它删除掉
# Modifying the tree: renaming tags, changing attributes and text, and
# removing nodes.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc,'lxml')
# p_tag = soup.p
# print(p_tag)

# # Rename the tag.
# p_tag.name = 'new_p'
# print(p_tag)

# # Change an attribute.
# p_tag['class'] = 'newclass'
# print(p_tag)

# Replace the text content.
# p_tag.string = 'new_string'
# p_tag.append('new')  # append extra content to the tag
# print(p_tag)

# Remove content.
html = soup.html
title = soup.title  # the node to be removed
# NOTE(review): the removal call itself is not visible here (presumably
# title.decompose() follows) - confirm.