from bs4 import BeautifulSoup
# Shared sample HTML document. Per the original author's note, the earlier
# demos (not visible in this part of the file) all parse this same text, so it
# is defined once here. The demos further down each define their own local
# `html_doc` and do not reference this module-level one.
html_doc ="""
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
def demo06() -> None:
    """Relational selection, downward (2): direct children via ``.children``.

    Parses a small sample document and prints the direct child nodes of the
    first ``<p>`` tag in three ways: the raw iterator object, the iterator
    materialised as a list, and each child on its own line.
    """
    html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
    soup = BeautifulSoup(html_doc, "lxml")

    # `.children` is lazy: printing it shows only the iterator object itself
    # (the original notes recorded `<list_iterator object at 0x...>`), not
    # the nodes.
    print(soup.p.children)

    # Materialise the iterator to see the direct children of the first <p>:
    # text nodes interleaved with the three <a> tags. The exact whitespace in
    # the text nodes mirrors the layout of the HTML literal above.
    print(list(soup.p.children))

    # Or walk the children one at a time.
    for item in soup.p.children:
        print(item)
# 3. Getting descendant nodes -- descendants
def demo07() -> None:
    """Relational selection, downward (3): all descendants via ``.descendants``.

    Unlike ``.children``, ``.descendants`` yields both the children and the
    grandchildren (and deeper) of a node. Prints the raw generator object and
    then the full descendant list of the first ``<p>`` tag.
    """
    html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><span>Elsie</span>Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
    soup = BeautifulSoup(html_doc, "lxml")

    # `.descendants` is a generator: printing it shows only the generator
    # object (recorded as `<generator object Tag.descendants at 0x...>`).
    print(soup.p.descendants)

    # Materialised, every <a> tag is followed by its own nested nodes: for
    # link1 that is the <span>, the span's "Elsie" string and the trailing
    # "Elsie" string; for link2/link3 it is their single text node.
    # Iterating with `for item in soup.p.descendants:` works just as well.
    print(list(soup.p.descendants))
# 4. Getting the parent node -- parent; ancestor nodes -- parents
def demo08() -> None:
    """Relational selection, upward (1): ``.parent`` and ``.parents``.

    ``.parent`` gives the direct parent of a node; ``.parents`` is a generator
    over all of its ancestors. Prints the parent of the first ``<p>``, the
    parent of the first ``<a>``, and then every ancestor of the first ``<a>``
    together with its index.
    """
    html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<p>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
</p>
</p>
<p class="story">...</p>
"""
    soup = BeautifulSoup(html_doc, "lxml")

    # Parent of the first <p>: per the original notes this prints the whole
    # <body> element, including its child <p> tags and grandchild <a> tags.
    print(soup.p.parent)

    # Parent of the first <a>: the enclosing <p>, which includes the <a> tag
    # itself and its text.
    print(soup.a.parent)
    print("=======================")

    # `.parents` is a generator: printing it shows only the generator object
    # (recorded as `<generator object PageElement.parents at 0x...>`).
    print(soup.a.parents)

    # Walk the ancestor chain of the first <a>, numbering each ancestor.
    for i, parent in enumerate(soup.a.parents):
        print(i, parent)
# 5. Getting sibling nodes
def demo09() -> None:
    """Relational selection: sibling nodes.

    Demonstrates the four sibling attributes on the first ``<a>`` tag:

    1. ``next_sibling``
    2. ``previous_sibling``
    3. ``next_siblings``
    4. ``previous_siblings``
    """
    html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>hello
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
<a href="http://example.com/a" class="sister" id="link3">a</a>
<a href="http://example.com/b" class="sister" id="link3">b</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
    soup = BeautifulSoup(html_doc, "lxml")

    # 1. next_sibling: the node right after the first <a>, here the text
    #    node that starts with "hello".
    print(soup.a.next_sibling)

    # 2. next_siblings: a generator over every following sibling; printing
    #    it shows only the generator object. Wrap it in list(...) to see the
    #    nodes themselves.
    print(soup.a.next_siblings)

    # 3. previous_sibling: the node right before the first <a>, here the
    #    leading text "Once upon a time ... and their names were".
    print(soup.a.previous_sibling)

    # 4. previous_siblings: a generator over every preceding sibling; again
    #    only the generator object is printed. Wrap it in list(...) to see
    #    the nodes themselves.
    print(soup.a.previous_siblings)
# 6. Method selectors
# 6.1 find_all()
def demo10() -> None:
    """Method selector ``find_all()``: return every match as a list.

    Signature as summarised in the original notes::

        find_all(name, attrs={}, recursive=True, string, limit)

    * ``name``: tag name to search for.
    * ``attrs``: dictionary of attribute filters.
    * ``recursive``: search all descendants of the element (default ``True``).
    * ``string``: search by text content.
    * ``limit``: maximum number of results to return.
    """
    html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="Dormouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
    soup = BeautifulSoup(html_doc, "lxml")

    # 1. Basic usage: find every <a> tag. For this document that is the
    #    three links link1, link2 and link3. Iterate the result and read
    #    `item.string` to get just the link texts.
    print(soup.find_all("a"))

    # 2. Attribute search: filter with an attribute dictionary, here every
    #    element whose class is "sister".
    print(soup.find_all(attrs={"class": "sister"}))

    #    Same effect using the `class_` keyword (`class` is a reserved word).
    print(soup.find_all(class_="sister"))

    #    This prints [] for this document: no tag above carries the class
    #    "hi". The output recorded in the original notes showed
    #    class="sister hi" on link1, which this HTML does not have. Add "hi"
    #    to a tag's class list to see a match.
    print(soup.find_all(class_="hi"))

    # 3. Text search: find the strings whose text is "Elsie".
    print(soup.find_all(string="Elsie"))
# 6.2 find()
def demo11() -> None:
    """Method selector ``find()``: return a single element.

    Where ``find_all()`` returns a list, ``find()`` generally returns the
    first matching element as the result.
    """
    html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="Dormouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><span>Elsie</span></a>,
<a href="http://example.com/lacie" class="sister" id="link2"><span>Lacie</span></a> and
<a href="http://example.com/tillie" class="sister" id="link3"><span>Tillie</span></a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
    soup = BeautifulSoup(html_doc, "lxml")

    # Prints only the first <a> tag (link1, including its nested <span>).
    print(soup.find("a"))