from bs4 import BeautifulSoup

# Sample document used by this snippet. The tag names and attributes must be
# separated by spaces (<p class=...>, <a href=... class=... id=...>);
# otherwise no <p> or <a> tags are recognised and soup.a below is None.
html_doc = """
<html><head><title>The Dormouse's story</title></head><body><p class="title"><b>The Dormouse's story</b></p><p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p><p class="story">...</p>
"""
# Create a BeautifulSoup object named soup. The parser is named explicitly so
# the result does not depend on which optional parsers happen to be installed.
soup = BeautifulSoup(html_doc, "html.parser")
# The soup object itself is special: its name is "[document]".
print(soup.name)
# soup.<tag name> is an easy way to fetch a tag,
# but it only returns the first matching tag in the whole document.
print(soup.title)
print(soup.a)
# For any ordinary tag inside the document, .name is the tag's own name.
print(soup.head.name)
from bs4 import BeautifulSoup

# Sample document used by this snippet. The tag names and attributes must be
# separated by spaces (<p class=...>, <a href=... class=... id=...>) for the
# <p> and <a> tags to be recognised by the parser.
html_doc = """
<html><head><title>The Dormouse's story</title></head><body><p class="title"><b>The Dormouse's story</b></p><p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p><p class="story">...</p>
"""
# Create a BeautifulSoup object. The parser is named explicitly so the result
# does not depend on which optional parsers happen to be installed.
soup = BeautifulSoup(html_doc, "html.parser")
# Print the type of the soup object's name attribute
# (note: this inspects soup.name, not the soup object itself).
print(type(soup.name))
# Print the name of the BeautifulSoup object.
print(soup.name)
# Print the attributes of the BeautifulSoup object.
print(soup.attrs)
from bs4 import BeautifulSoup

# Sample document used by this snippet.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Create a BeautifulSoup object named soup. The parser is named explicitly so
# the result does not depend on which optional parsers happen to be installed.
soup = BeautifulSoup(html_doc, "html.parser")
# The simplest way to navigate the tree is to name the tag you want.
# To get the <head> tag, just use soup.head.
print(soup.head)
# Get the first <b> tag inside the <body> tag.
print(soup.body.b)
# Attribute-style access only returns the first tag with that name,
# so this is only the first <a> in the soup object.
print(soup.a)
# .contents returns the direct children of the current tag as a list.
print(soup.head.contents)
# .children iterates over the direct children of the current tag
# (here: the children of the first child of <head>).
for child in soup.head.contents[0].children:
    print(child)
# .descendants iterates over every descendant of the current tag.
for child in soup.head.descendants:
    print(child)
# If a tag has a single NavigableString child, .string returns that child.
print(soup.head.contents[0].string)
# If a tag has exactly one child tag, .string can be used on it as well.
print(soup.head.string)
# If a tag contains several strings, loop over them with .strings.
for string in soup.strings:
    print(repr(string))
# The strings may contain many blank lines or spaces;
# .stripped_strings removes the surplus whitespace.
for string in soup.stripped_strings:
    print(repr(string))
from bs4 import BeautifulSoup

# <b>, <c>, <d> and <e> are all children of <a>,
# so <b>, <c>, <d> and <e> are siblings of one another.
# NOTE(review): the markup ends with stray closing tags (</d></c></b>);
# presumably the parser ignores them — confirm, or drop them from the string.
# The parser is named explicitly so the resulting tree does not depend on
# which optional parsers happen to be installed.
soup = BeautifulSoup(
    "<a><b>text1</b><c>text2</c><d>text3</d><e>text4</e></d></c></b></a>",
    "html.parser",
)
print(soup.prettify())
# <b> has a next sibling, so .next_sibling is set;
# <b> has no previous sibling, so .previous_sibling is None.
print(soup.b.next_sibling)
print(soup.b.previous_sibling)
# <e> has no next sibling, so .next_sibling is None;
# <e> has a previous sibling, so .previous_sibling is set.
print(soup.e.next_sibling)
print(soup.e.previous_sibling)
# Note: the strings "text1" and "text2" are not siblings, because they have
# different parents.
# .next_siblings iterates over all siblings after the current tag.
for sibling in soup.b.next_siblings:
    print(repr(sibling))
# .previous_siblings iterates over all siblings before the current tag.
for sibling in soup.e.previous_siblings:
    print(repr(sibling))
# .next_element points to the object parsed immediately after this one.
print(soup.a.next_element)
# .previous_element points to the object parsed immediately before this one.
print(soup.e.previous_element)
# The .next_elements and .previous_elements iterators walk forwards or
# backwards through the document in parse order, as if it were being parsed.
for element in soup.a.next_elements:
    print(repr(element))
for element in soup.e.previous_elements:
    print(repr(element))
from bs4 import BeautifulSoup

# Sample document used by this snippet. The tag names and attributes must be
# separated by spaces (<p class=...>, <a href=... class=... id=...>);
# otherwise no <a> tag is recognised and soup.a.parents below would fail
# because soup.a is None.
html_doc = """
<html><head><title>The Dormouse's story</title></head><body><p class="title"><b>The Dormouse's story</b></p><p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p><p class="story">...</p>
"""
# The parser is named explicitly so the result does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(html_doc, "html.parser")
# .parent returns the parent of the current tag.
print(soup.title.parent)
# The title string has a parent too: the <title> tag.
print(soup.title.string.parent)
# The parent of a top-level tag such as <html> is the BeautifulSoup object.
print(soup.html.parent)
# The parent of the BeautifulSoup object is None.
print(soup.parent)
# .parents walks up through every ancestor of the tag.
for parent in soup.a.parents:
    # Guard kept from the original snippet: print None itself rather than
    # attempting to read .name on it.
    if parent is None:
        print(parent)
    else:
        print(parent.name)