.string–获取文本内容
html = '''
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</span></b></p>>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
'''
# Step 1: bring in the parser class.
from bs4 import BeautifulSoup
# Step 2: build the soup (argument 1: content to parse, argument 2: parser name).
soup = BeautifulSoup(html, 'lxml')
# Tip: print(soup.prettify()) can be used to inspect the completed markup.
# Selecting by tag name yields the tag itself together with everything inside it.
for tag in (soup.head, soup.p):  # soup.p is only the first matching <p>
    print(tag)
# .string is an attribute that holds the text of a tag.
title_via_full_path = soup.html.head.title
title_via_shortcut = soup.title
print(title_via_full_path.string)
print(title_via_shortcut.string)
获取名称
.name --获取标签本身名称
html = '''
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</span></b></p>>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# .name reports the name of the tag itself.
title_tag, first_paragraph = soup.title, soup.p
print(title_tag.name)  # --> title
print(first_paragraph.name)  # tag name of the first <p>
获取属性
.attrs[] --通过属性拿属性的值
html = '''
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="title asdas" name="dromouse" id="qwe"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
first_paragraph = soup.p
# Form 1: look the attribute value up through the .attrs mapping.
for attr_name in ('name', 'id'):
    print(first_paragraph.attrs[attr_name])
# Form 2: index the tag directly with the attribute name.
print(first_paragraph['id'])
print(first_paragraph['class'])  # kept as a list
first_link = soup.a
print(first_link['href'])  # again only the first matching tag
嵌套选择
子父级关系
子节点和子孙节点
.contents 获取标签子节点,以列表形式返回
html = '''
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="score">
<a href="/films/1417305" title="我和我的父辈" class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}">
<img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
</a>
<a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}"></a>
<a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}"></a>
</p>
<p class="story">...</p>
'''
# Demo: list every direct child of the first <p> via .contents.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# A plain tag selector such as soup.p.a only reaches part of the content
# (the first match), so it cannot be used to collect all the children.
# .contents exposes the direct children of a tag as a list; the <a> tags are
# children of <p>, so iterating soup.p.contents visits each of them.
for child in soup.p.contents:
    print(child)
.children 返回的是一个迭代器(不是 list),需要遍历才能取到各个子节点
html = '''
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="score">
<a href="/films/1417305" title="我和我的父辈" class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}">
<img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
</a>
<a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}"></a>
<a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}"></a>
</p>
<p class="story">...</p>
'''
# Demo: walk the direct children of the first <p> via .children, with an index.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# .children hands back an iterator over the direct children, so printing it
# directly only shows the iterator object; loop over it to see the nodes.
# enumerate() pairs every item of an iterable with its index, which is the
# usual way to get both the position and the value inside a for loop.
for i, child in enumerate(soup.p.children):
    print(i)
    print(child)
.descendants 获取子孙节点,返回的是一个生成器
html = '''
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="score">
<a href="/films/1417305" title="我和我的父辈" class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}">
<img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
</a>
<a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}"></a>
<a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}"></a>
</p>
<p class="story">...</p>
'''
# Demo: walk every descendant (children, grandchildren, ...) of the first <p>.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# .descendants is a generator, so printing it (or enumerate() of it) directly
# only shows the generator object; iterate to see each node with its index.
for i, child in enumerate(soup.p.descendants):
    print(i, child)
.parent 获取父节点
html = '''
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="score">
<a href="/films/1417305" title="我和我的父辈" class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}">
<img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
</a>
<a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}"></a>
<a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}"></a>
</p>
<p class="story">...</p>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# Take the first <a>, then step up to the node that contains it.
first_link = soup.a
print(first_link.parent)
兄弟节点
.next_siblings 获取后面的兄弟节点
.previous_siblings 获取前面的兄弟节点
两者返回的都是一个生成器对象
html = '''
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="score">
<span>abcqwe</span>
sadaasasdasdasdsd
<a href="/films/1417305" title="我和我的父辈" class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}">
<span>Elsie</span>
</a>
<a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}"></a>
<a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}"></a>
</p>
<p class="story">...</p>
'''
# Demo: inspect the siblings that follow and precede the first <a>.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
first_link = soup.a
# The attribute is spelled next_siblings; a misspelled attribute name is not
# an error in bs4 (it is looked up as a child tag), so the typo went unnoticed.
# Printed as-is this shows the generator object rather than the nodes.
print(first_link.next_siblings)
# Materialise the generator with list() to see the (index, node) pairs.
print(list(enumerate(first_link.previous_siblings)))
实用:标准选择器
find_all(name, attrs, recursive, string, limit, **kwargs)(string 参数在 4.4.0 之前的旧版本中叫 text)
可根据标签名、属性、内容查找文档
使用find_all()根据标签名查找
html = '''
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="score">
<span>abcqwe</span>
sadaasasdasdasdsd
<a href="/films/1417305" title="我和我的父辈" class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}">
<span>Elsie</span>
</a>
<a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}">1</a>
<a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}">2</a>
</p>
<p class="story">...</p>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# find_all('a') gathers every <a> tag; print(all_links) would show them all.
all_links = soup.find_all('a')
# Index into the result to pick a single match, here the first one.
print(all_links[0])
.string获取文本值
# Print every <a> found inside each <p>, followed by its .string value.
# Uses the `soup` object built by the snippet just before this one.
for paragraph in soup.find_all('p'):
    # print(paragraph) would show the whole <p> being searched.
    for link in paragraph.find_all('a'):
        print(link)
        print(link.string)
get_text()获取内容
html = '''
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="score">
<span>abcqwe</span>
sadaasasdasdasdsd
<a href="/films/1417305" title="我和我的父辈" class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}">
<span>Elsie</span>
</a>
<a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}">1</a>
<a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}">2</a>
</p>
<p class="story">...</p>
'''
# Demo: print the text content of every <a> inside each <p> using get_text().
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
for paragraph in soup.find_all('p'):
    for link in paragraph.find_all('a'):
        print(link.get_text())
使用find_all()根据属性查找
html = '''
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="score">
<span>abcqwe</span>
sadaasasdasdasdsd
<a href="/films/1417305" title="我和我的父辈" class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}">
<span>Elsie</span>
</a>
<a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}">1</a>
<a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}">2</a>
</p>
<p class="story">...</p>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# Form 1: name the attribute through attrs.
# Syntax: attrs={'attribute': 'value'}
matches_by_attrs = soup.find_all(attrs={'class': 'image-link'})  # filter on class
print(matches_by_attrs)
# Form 2: pass the attribute as a keyword argument, (attribute='value'),
# e.g. soup.find_all(href="/films/1356063").
# Special case: class needs a trailing underscore,
# e.g. soup.find_all(class_='image-link').
# Form 3 (the one the original notes recommend): tag name plus attribute dict.
matches_by_tag_and_attrs = soup.find_all('a', {'class': 'image-link'})
print(matches_by_tag_and_attrs)
text属性值
html = '''
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="score">
<span>abcqwe</span>
sadaasasdasdasdsd
<a href="/films/1417305" title="我和我的父辈" class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}">
<span>Elsie</span>
</a>
<a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}">1</a>
<a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}">2</a>
</p>
<p class="story">...</p>
'''
# Demo: search the document by text content instead of by tag or attribute.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# Syntax: string='text to look for'
# `string=` is the current name of the argument that used to be spelled
# `text=`; the legacy spelling is kept by bs4 only for backward compatibility.
# The result is a list of the matching strings, so it can be used for counting.
for needle in ('1', '2'):
    print(soup.find_all(string=needle))
find查找单个 find_all查找全部
html = '''
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="score">
<span>abcqwe</span>
sadaasasdasdasdsd
<a href="/films/1417305" title="我和我的父辈" class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}">
<span>Elsie</span>
</a>
<a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}">1</a>
<a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}">2</a>
</p>
<p class="story">...</p>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
# find() yields a single match; when nothing matches the result is None.
first_link = soup.find('a')
print(first_link)
<a class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}" href="/films/1417305" title="我和我的父辈">
<span>Elsie</span>
</a>
css选择器
介绍
1、类别选择器 – class
2、标签选择器 – 标签名
3、ID选择器 – id
使用
通过select()直接传入css选择器
1、用CSS选择器时,标签名不加任何修饰,class类名前加.,id名前加#
2、用到的方法是soup.select(),返回类型是list
3、多个过滤器需要空格隔开,严格遵守从前往后逐层筛选
html = '''
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="score">
<span>abcqwe</span>
sadaasasdasdasdsd
<a href="/films/1417305" title="我和我的父辈" class="image-link" id="list-1" data-act="boarditem-click" data-val="{movieId:1417305}">
<span>Elsie</span>
</a>
<a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}">1</a>
<a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}">2</a>
</p>
<p class="story">...</p>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# By tag: the tag name is written bare and several conditions are separated
# by spaces, e.g. soup.select('p a').
# By class: put a dot in front of the class name,
# e.g. soup.select('.score ') or soup.select('.score .image-link').
# By id: put a hash sign in front of the id.
matches_by_id = soup.select('#list-1')
print(matches_by_id)
[<a class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}" href="/films/1417305" id="list-1" title="我和我的父辈">
<span>Elsie</span>
</a>]
获取属性的值
两种写法
1.ul['id']
2.ul.attrs['id']
html = '''
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="score">
<span>abcqwe</span>
sadaasasdasdasdsd
<a href="/films/1417305" title="我和我的父辈" class="image-link" id="list-1" data-act="boarditem-click" data-val="{movieId:1417305}">
<span>Elsie</span>
</a>
<a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}">1</a>
<a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}">2</a>
</p>
<p class="story">...</p>
'''
# Demo: read an attribute value from every tag matched by a CSS selector.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
for paragraph in soup.select('p'):
    # print(paragraph) would show the whole matched <p>.
    # Indexing the tag with an attribute name gives that attribute's value.
    print(paragraph['class'])
['score']
['story']