>>> soup.title  #访问<title>标签的内容
<title>The Dormouse's story</title>
>>> soup.title.name  #查看标签的名字
'title'
>>> soup.title.text  #查看标签的文本
"The Dormouse's story"
>>> soup.title.string  #查看标签的文本
"The Dormouse's story"
>>> soup.title.parent  #查看上一级标签
<head><title>The Dormouse's story</title></head>
>>> soup.head
<head><title>The Dormouse's story</title></head>
>>> soup.b  #访问<b>标签的内容
<b>The Dormouse's story</b>
>>> soup.body.b  #访问<body>中<b>标签的内容
<b>The Dormouse's story</b>
>>> soup.name  #把整个BeautifulSoup对象看作标签对象
'[document]'
>>> soup.body  #查看body标签内容
<body><p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters;and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;and they lived at the bottom of a well.</p>
<p class="story">...</p></body>
>>> soup.p  #查看段落信息
<p class="title"><b>The Dormouse's story</b></p>
>>> soup.p['class']  #查看标签属性
['title']
>>> soup.p.get('class')  #也可以这样查看标签属性
['title']
>>> soup.p.text  #查看段落文本
"The Dormouse's story"
>>> soup.p.contents  #查看段落内容
[<b>The Dormouse's story</b>]
>>> soup.a
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
>>> soup.a.attrs  #查看标签所有属性
{'class': ['sister'], 'href': 'http://example.com/elsie', 'id': 'link1'}
>>> soup.find_all('a')  #查找所有<a>标签
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
>>> soup.find_all(['a', 'b'])  #同时查找<a>和<b>标签
[<b>The Dormouse's story</b>, <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
>>> import re
>>> soup.find_all(href=re.compile("elsie"))  #查找href包含特定关键字的标签
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
>>> soup.find(id='link3')  #查找属性id='link3'的标签
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
>>> soup.find_all('a', id='link3')  #查找属性'link3'的a标签
[<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
>>> for link in soup.find_all('a'):
...     print(link.text, ':', link.get('href'))
Elsie : http://example.com/elsie
Lacie : http://example.com/lacie
Tillie : http://example.com/tillie
>>> print(soup.get_text())  #返回所有文本
The Dormouse's story
The Dormouse's story
Once upon a time there were three little sisters;and their names were
Elsie,
Lacie and
Tillie;and they lived at the bottom of a well.
...
>>> soup.a['id'] = 'test_link1'  #修改标签属性的值
>>> soup.a
<a class="sister" href="http://example.com/elsie" id="test_link1">Elsie</a>
>>> soup.a.string.replace_with('test_Elsie')  #修改标签文本
'Elsie'
>>> soup.a.string
'test_Elsie'
>>> for child in soup.body.children:  #遍历直接子标签
...     print(child)
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters;and their names were
<a class="sister" href="http://example.com/elsie" id="test_link1">test_Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;and they lived at the bottom of a well.</p>
<p class="story">...</p>
>>> test_doc = '<html><head></head><body><p></p><p></p></body></heml>'
>>> s = BeautifulSoup(test_doc, 'lxml')
>>> for child in s.html.children:  #遍历直接子标签
...     print(child)
<head></head>
<body><p></p><p></p></body>
>>> for child in s.html.descendants:  #遍历子孙标签
...     print(child)
<head></head>
<body><p></p><p></p></body>
<p></p>
<p></p>