[Python]网络爬虫 beautifulsoup4库基本操作

最新推荐文章于 2024-07-10 17:28:32 发布

ciao~chao

最新推荐文章于 2024-07-10 17:28:32 发布

阅读量107

点赞数

分类专栏： Python 文章标签： python 爬虫

本文链接：https://blog.csdn.net/chao0401/article/details/110585738

版权

Python 专栏收录该内容

38 篇文章 2 订阅

订阅专栏

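beautifulsoup4库基本操作

说明：以下示例假设已执行 from bs4 import BeautifulSoup，并已用 BeautifulSoup 解析示例HTML文档（"The Dormouse's story"）得到 soup 对象。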

>>> soup.title #访问<title>标签的内容
<title>The Dormouse's story</title>

>>> soup.title.name #查看标签的名字
'title'

>>> soup.title.text #查看标签的文本
"The Dormouse's story"

>>> soup.title.string #查看标签的文本
"The Dormouse's story"

>>> soup.title.parent #查看上一级标签
<head><title>The Dormouse's story</title></head>
>>> soup.head
<head><title>The Dormouse's story</title></head>

>>> soup.b #访问<b>标签的内容
<b>The Dormouse's story</b>

>>> soup.body.b #访问<body>中<b>标签的内容
<b>The Dormouse's story</b>

>>> soup.name #把整个BeautifulSoup对象看作标签对象
'[document]' 

>>> soup.body #查看body标签内容
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>

>>> soup.p #查看段落信息
<p class="title"><b>The Dormouse's story</b></p>

>>> soup.p['class'] #查看标签属性
['title']

>>> soup.p.get('class') #也可以这样查看标签属性
['title']

>>> soup.p.text #查看段落文本
"The Dormouse's story"

>>> soup.p.contents #查看段落内容
[<b>The Dormouse's story</b>]

>>> soup.a
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

>>> soup.a.attrs #查看标签所有属性
{'class': ['sister'], 'href': 'http://example.com/elsie', 'id': 'link1'}

>>> soup.find_all('a') #查找所有<a>标签
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

>>> soup.find_all(['a', 'b']) #同时查找<a>和<b>标签
[<b>The Dormouse's story</b>, <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

>>> import re
>>> soup.find_all(href=re.compile("elsie")) #查找href包含特定关键字的标签
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

>>> soup.find(id='link3') #查找属性id='link3'的标签
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

>>> soup.find_all('a', id='link3') #查找属性id='link3'的<a>标签
[<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

>>> for link in soup.find_all('a'):
    print(link.text,':',link.get('href'))
Elsie : http://example.com/elsie
Lacie : http://example.com/lacie
Tillie : http://example.com/tillie

>>> print(soup.get_text()) #返回所有文本
The Dormouse's story
The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...

>>> soup.a['id'] = 'test_link1' #修改标签属性的值
>>> soup.a
<a class="sister" href="http://example.com/elsie" id="test_link1">Elsie</a>

>>> soup.a.string.replace_with('test_Elsie') #修改标签文本
'Elsie'

>>> soup.a.string
'test_Elsie'

>>> for child in soup.body.children: #遍历直接子标签
    print(child) 
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="test_link1">test_Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>

>>> test_doc = '<html><head></head><body><p></p><p></p></body></html>'
>>> s = BeautifulSoup(test_doc, 'lxml')
>>> for child in s.html.children: #遍历直接子标签
    print(child)   
<head></head>
<body><p></p><p></p></body>

>>> for child in s.html.descendants: #遍历子孙标签
    print(child)   
<head></head>
<body><p></p><p></p></body>
<p></p>
<p></p>