BeautifulSoup4学习笔记

.string–获取文本内容

html = '''
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</span></b></p>>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
'''

# Step 1: import the parser class.
from bs4 import BeautifulSoup

# Step 2: build the parse tree.
# First argument: the markup to parse; second argument: the parser to use.
soup = BeautifulSoup(html, features='lxml')
# prettify() pretty-prints the parsed document, which shows how the parser
# completed the markup that the fixture leaves unclosed:
# print(soup.prettify())

# Selecting by tag name returns the first matching tag, including the tag
# itself and everything nested inside it.
for first_match in (soup.head, soup.p):
    print(first_match)

# .string is an attribute (not a method) holding the tag's text.
# The fully spelled-out path and the shortcut reach the same <title>.
for title_tag in (soup.html.head.title, soup.title):
    print(title_tag.string)

获取名称
.name --获取标签本身名称

html = '''
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</span></b></p>>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, features='lxml')

# .name is the tag's own name, not its contents:
# the first line printed is 'title', the second is the name of the first <p>.
print(soup.title.name, soup.p.name, sep='\n')

获取属性
.attrs[] --通过属性拿属性的值

html = '''
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
    <p class="title asdas" name="dromouse" id="qwe"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
'''
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, features='lxml')
first_p = soup.p

# Form 1: look the attribute up in the tag's .attrs mapping.
print(first_p.attrs['name'])   # value of the name attribute of the first <p>
print(first_p.attrs['id'])     # value of its id attribute

# Form 2: index the tag directly.
print(first_p['id'])
print(first_p['class'])        # class values are stored as a list

# Tag-name access also returns only the first <a>.
print(soup.a['href'])

嵌套选择

子父级关系

子节点和子孙节点

.contents 获取标签子节点,以列表形式返回

html = '''
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
    <p class="score">
        <a href="/films/1417305" title="我和我的父辈" class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}">
          <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
        </a>
        <a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}"></a>
        <a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}"></a>
        
    </p>
    <p class="story">...</p>
'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, features='lxml')

# Tag-name access only reaches the first match, not all of them.
# How do we get every child?
# print(soup.p.a)

# .contents exposes the direct children of a tag as a list
# (<a> is a child of <p>, so this lists everything directly inside the <p>):
# print(soup.p.contents)

direct_children = soup.p.contents
for node in direct_children:
    print(node)

.children返回的是一个list类型的迭代器

html = '''
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
    <p class="score">
        <a href="/films/1417305" title="我和我的父辈" class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}">
          <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
        </a>
        <a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}"></a>
        <a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}"></a>
        
    </p>
    <p class="story">...</p>
'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, features='lxml')

# .children walks the same direct children as .contents, but hands back an
# iterator instead of a list:
# print(soup.p.children)
#
# for i in soup.p.children:
#     print(i)
#
# print('----' * 5)

# enumerate() attaches a running index to any iterable, so the loop receives
# the position and the item together.
for index, node in enumerate(soup.p.children):
    print(index, node, sep='\n')

.descendants 获取子孙节点,返回的是一个生成器

html = '''
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
    <p class="score">
        <a href="/films/1417305" title="我和我的父辈" class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}">
          <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
        </a>
        <a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}"></a>
        <a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}"></a>
        
    </p>
    <p class="story">...</p>
'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, features='lxml')

# .descendants is a generator over every nested node (children, grandchildren, ...):
# print(soup.p.descendants)
# print(enumerate(soup.p.descendants))

# Each item from enumerate() is an (index, node) pair; unpack it straight
# into print() so both land on one line.
for numbered_node in enumerate(soup.p.descendants):
    print(*numbered_node)

.parent 获取父节点

html = '''
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
    <p class="score">
        <a href="/films/1417305" title="我和我的父辈" class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}">
          <img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
        </a>
        <a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}"></a>
        <a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}"></a>
        
    </p>
    <p class="story">...</p>
'''
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, features='lxml')

# .parent is the tag that directly contains the node; here, the parent of
# the first <a>.
first_link = soup.a
print(first_link.parent)

兄弟节点

.next_siblings 获取后面的兄弟节点

.previous_siblings 获取前面的兄弟节点

两者返回的都是一个生成器对象

html = '''
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
    <p class="score">
        <span>abcqwe</span>
        sadaasasdasdasdsd
        <a href="/films/1417305" title="我和我的父辈" class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}">
          <span>Elsie</span>
        </a>
        <a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}"></a>
        <a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}"></a>
        
    </p>
    <p class="story">...</p>
'''
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# .next_siblings is a generator over the nodes that follow the first <a>
# inside the same parent. Printing it directly shows the generator object.
# (A misspelled attribute name would not raise: bs4 would treat it as a
# search for a tag with that name and return None.)
print(soup.a.next_siblings)

# .previous_siblings is the mirror image: the nodes before the first <a>.
# Materialise it with list() so the indexed items themselves are printed.
print(list(enumerate(soup.a.previous_siblings)))

实用:标准选择器

find_all(name, attrs, recursive, string, limit, **kwargs)

可根据标签名、属性、内容查找文档
使用find_all()根据标签名查找

html = '''
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
    <p class="score">
        <span>abcqwe</span>
        sadaasasdasdasdsd
        <a href="/films/1417305" title="我和我的父辈" class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}">
          <span>Elsie</span>
        </a>
        <a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}">1</a>
        <a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}">2</a>
        
    </p>
    <p class="story">...</p>
'''
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, features='lxml')

# find_all() returns every matching tag:
# print(soup.find_all('a'))
all_links = soup.find_all('a')
print(all_links[0])   # index the result to pick a single match

.string获取文本值

# find_all() can be called on a tag too, which limits the search to that tag.
# For every <p>, print each <a> inside it followed by the link's .string.
for paragraph in soup.find_all('p'):
    # print(paragraph)
    for link in paragraph.find_all('a'):
        print(link, link.string, sep='\n')

get_text()获取内容

html = '''
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
    <p class="score">
        <span>abcqwe</span>
        sadaasasdasdasdsd
        <a href="/films/1417305" title="我和我的父辈" class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}">
          <span>Elsie</span>
        </a>
        <a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}">1</a>
        <a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}">2</a>
        
    </p>
    <p class="story">...</p>
'''
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, features='lxml')

# Walk every <a> inside every <p>, in document order, and print the text
# that get_text() collects from each link.
links_in_paragraphs = (
    link
    for paragraph in soup.find_all('p')
    for link in paragraph.find_all('a')
)
for link in links_in_paragraphs:
    print(link.get_text())

使用find_all()根据属性查找

html = '''
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
    <p class="score">
        <span>abcqwe</span>
        sadaasasdasdasdsd
        <a href="/films/1417305" title="我和我的父辈" class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}">
          <span>Elsie</span>
        </a>
        <a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}">1</a>
        <a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}">2</a>
        
    </p>
    <p class="story">...</p>
'''
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, features='lxml')

class_filter = {'class': 'image-link'}

# Form 1: pass the filter through the attrs keyword.
# Syntax: attrs={'attribute-name': 'attribute-value'}
print(soup.find_all(attrs=class_filter))   # match on the class attribute

# Form 2: pass the attribute as a keyword argument.
# Syntax: (attribute_name='attribute-value')
# print(soup.find_all(href="/films/1356063"))

# Special case: class is a Python keyword, so it is spelled with a trailing
# underscore.
# print(soup.find_all(class_='image-link'))

# Form 3 (the one these notes recommend): tag name plus attribute dict.
print(soup.find_all('a', class_filter))

text属性值

html = '''
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
    <p class="score">
        <span>abcqwe</span>
        sadaasasdasdasdsd
        <a href="/films/1417305" title="我和我的父辈" class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}">
          <span>Elsie</span>
        </a>
        <a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}">1</a>
        <a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}">2</a>
        
    </p>
    <p class="story">...</p>
'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Search by text content instead of by tag.
# Syntax: string='text to look for'
# `string` is the current name of this argument (Beautiful Soup 4.4.0+);
# `text` is the older, deprecated spelling of the same filter.
# The result is a list of the matching strings, so it is handy for counting
# how often a piece of text occurs.
print(soup.find_all(string='1'))
print(soup.find_all(string='2'))

find查找单个 find_all查找全部

html = '''
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
    <p class="score">
        <span>abcqwe</span>
        sadaasasdasdasdsd
        <a href="/films/1417305" title="我和我的父辈" class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}">
          <span>Elsie</span>
        </a>
        <a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}">1</a>
        <a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}">2</a>
        
    </p>
    <p class="story">...</p>
'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, features='lxml')

# find() returns a single match (the first one), or None when nothing matches.
first_link = soup.find('a')
print(first_link)
<a class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}" href="/films/1417305" title="我和我的父辈">
<span>Elsie</span>
</a>

css选择器

介绍

1、类别选择器 – class
2、标签选择器 – 标签名(如 p、a)


3、ID选择器 – id

使用

通过select()直接传入css选择器

1、用CSS选择器时,标签名不加任何修饰,class类名前加.,id名前加#
2、用到的方法是soup.select(),返回类型是list
3、多个过滤器需要空格隔开,严格遵守从前往后逐层筛选

html = '''
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
    <p class="score">
        <span>abcqwe</span>
        sadaasasdasdasdsd
        <a href="/films/1417305" title="我和我的父辈" class="image-link" id="list-1" data-act="boarditem-click" data-val="{movieId:1417305}">
          <span>Elsie</span>
        </a>
        <a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}">1</a>
        <a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}">2</a>
        
    </p>
    <p class="story">...</p>
'''

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, features='lxml')

# Select by tag: tag names are written bare, and several conditions are
# separated by spaces.
# print(soup.select('p a'))

# Select by class: prefix the class name with a dot.
# print(soup.select('.score '))
# print(soup.select('.score .image-link'))

# Select by id: prefix the id with '#'.
id_selector = '#list-1'
print(soup.select(id_selector))
[<a class="image-link" data-act="boarditem-click" data-val="{movieId:1417305}" href="/films/1417305" id="list-1" title="我和我的父辈">
<span>Elsie</span>
</a>]

获取属性的值

两种写法
1.ul['id']
2.ul.attrs['id']

html = '''
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
    <p class="score">
        <span>abcqwe</span>
        sadaasasdasdasdsd
        <a href="/films/1417305" title="我和我的父辈" class="image-link" id="list-1" data-act="boarditem-click" data-val="{movieId:1417305}">
          <span>Elsie</span>
        </a>
        <a href="/films/1356063" title="峰爆" class="image-link" data-act="boarditem-click" data-val="{movieId:1356063}">1</a>
        <a href="/films/1328693" title="五个扑水的少年" class="image-link" data-act="boarditem-click" data-val="{movieId:1328693}">2</a>
        
    </p>
    <p class="story">...</p>
'''
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, features='lxml')

# select() returns the matching tags; index each one like a dict to read an
# attribute value. Here: the class list of every <p>.
# (print the tag itself instead to see the whole element)
for paragraph in soup.select('p'):
    css_classes = paragraph['class']
    print(css_classes)
['score']
['story']
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
BeautifulSoup是一个Python库,用于从HTML或XML文件中提取数据。它提供了一种简单而灵活的方式来解析和遍历标记文档,并提供了许多有用的方法和属性来获取所需的信息。 在BeautifulSoup库的学习笔记中,介绍了BeautifulSoup库的简介和安装方法。它可以通过pip进行安装,并且有多种解析器可供选择。BeautifulSoup类有五种基本元素,包括Tag标签、Name名字、Attributes属性、NavigableString非属性字符串和Comment注释。它们可以用于遍历标签树的下行、上行和平行遍历。此外,BeautifulSoup几乎覆盖了HTML和XML中的所有内容,还包括一些特殊对象,例如文档的注释部分。 需要注意的是,BeautifulSoup对象本身不是真正的HTML或XML的tag,因此它没有name和attribute属性。但是,在某些情况下,查看它的.name属性是很方便的,因此BeautifulSoup对象包含了一个特殊属性.name,其值为"[document]"。另外,还有一些特殊对象,例如注释对象,可以通过使用BeautifulSoup库来处理。 综上所述,BeautifulSoup库提供了强大的解析和提取HTML或XML中数据的功能,适用于各种爬虫和数据提取任务。<span class="em">1</span><span class="em">2</span><span class="em">3</span> #### 引用[.reference_title] - *1* *2* [BeautifulSoup学习笔记一](https://blog.csdn.net/weixin_43978546/article/details/104858873)[target="_blank" data-report-click={"spm":"1018.2226.3001.9630","extra":{"utm_source":"vip_chatgpt_common_search_pc_result","utm_medium":"distribute.pc_search_result.none-task-cask-2~all~insert_cask~default-1-null.142^v93^chatsearchT3_2"}}] [.reference_item style="max-width: 50%"] - *3* [BeautifulSoup 学习笔记](https://blog.csdn.net/zhengjian0617/article/details/81142540)[target="_blank" data-report-click={"spm":"1018.2226.3001.9630","extra":{"utm_source":"vip_chatgpt_common_search_pc_result","utm_medium":"distribute.pc_search_result.none-task-cask-2~all~insert_cask~default-1-null.142^v93^chatsearchT3_2"}}] [.reference_item style="max-width: 50%"] [ .reference_list ]
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

小刘私坊

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值