python爬虫之BeautifulSoup4介绍
1 bs4介绍
1.1 基本概念
Beautiful Soup 是一个可以从HTML或XML文件中提取数据的网页信息提取库
1.2 安装方法
# cmd 终端运行
pip install lxml # 安装 lxml 解析器(本文示例均使用它作为 bs4 的解析器)
pip install bs4
2 bs4的使用
2.1 快速入门
from bs4 import BeautifulSoup
# The HTML document to parse (the classic "three sisters" example
# from the official Beautiful Soup documentation; its unclosed tags
# are intentional — the parser repairs them).
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Build the BeautifulSoup object, using the lxml parser
soup = BeautifulSoup(html_doc,'lxml')
# Pretty-print the document (re-indents the markup nicely)
print(soup.prettify())
# The first <title> tag
print(soup.title) # <title>The Dormouse's story</title>
# The tag's name
print(soup.title.name) # title
# The text inside the <title> tag
print(soup.title.string) # The Dormouse's story
# The first <p> tag
print(soup.p) # <p class="title"><b>The Dormouse's story</b></p>
# Find ALL <p> tags, returned as a list
r = soup.find_all('p')
print(r)
# Result: [<p class="title"><b>The Dormouse's story</b></p>,
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>,
# <p class="story">...</p>]
2.2 bs4的对象种类
- tag : 标签
- NavigableString : 可导航的字符串
- BeautifulSoup : bs对象
- Comment : 注释
from bs4 import BeautifulSoup
# Demonstrates the four bs4 object kinds:
# BeautifulSoup, Tag, NavigableString, Comment.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc,'lxml')
print(type(soup))        # <class 'bs4.BeautifulSoup'>  -- the whole document
print(type(soup.title))  # <class 'bs4.element.Tag'>    -- Tag
print(type(soup.a))      # <class 'bs4.element.Tag'>    -- Tag
print(type(soup.p))      # <class 'bs4.element.Tag'>    -- Tag
print(soup.p.string)     # The Dormouse's story
print(type(soup.p.string))  # <class 'bs4.element.NavigableString'> -- NavigableString

# BUGFIX: this Tag demo must use `soup` BEFORE it is rebound below;
# the original rebound soup to the comment snippet first, so soup.p was
# None and `.name` raised AttributeError.
title_tag = soup.p
print(title_tag)         # <p class="title"><b>The Dormouse's story</b></p>
print(title_tag.name)    # p
print(title_tag.string)  # The Dormouse's story

# A markup comment is exposed as the Comment object kind.
html_comment = '<a><!-- 这里是注释内容--></a>'
comment_soup = BeautifulSoup(html_comment,'lxml')
print(comment_soup.a.string)        # 这里是注释内容
print(type(comment_soup.a.string))  # <class 'bs4.element.Comment'> -- Comment
3 遍历树 遍历子节点
bs里面有三种情况,第一个是遍历,第二个是查找,第三个是修改
3.1 contents children descendants
- contents 返回的是一个列表
- children 返回的是一个迭代器,通过这个迭代器可以进行迭代
- descendants 返回的是一个生成器,遍历子子孙孙
from bs4 import BeautifulSoup
'''
# contents    -> a list of the direct children
# children    -> an iterator over the direct children
# descendants -> a generator over ALL descendants (children, grandchildren, ...)
'''
html = '''
<div>
<a href='#'>百度</a>
<a href='#'>阿里</a>
<a href='#'>腾讯</a>
</div>
'''
# contents returns a list (here: the single <html> root lxml wraps around the fragment)
soup = BeautifulSoup(html,'lxml')
links = soup.contents
print(type(links)) # <class 'list'>
# print(links)
for i in links:
print(i)
结果:打印整个文档(lxml 会自动补全 <html><body> 标签):
# <html><body><div>
# <a href="#">百度</a>
# <a href="#">阿里</a>
# <a href="#">腾讯</a>
# </div></body></html>
# children returns an iterator over the direct children
links = soup.div.children
print(type(links)) # <class 'list_iterator'>
for link in links:
print(link)
# Result: <a href="#">百度</a>
# <a href="#">阿里</a>
# <a href="#">腾讯</a>
# descendants returns a generator that walks every descendant, depth-first
print(len(soup.contents))
# print(len(soup.descendants)) # TypeError: object of type 'generator' has no len()
for x in soup.descendants:
print('----------------')
print(x)
3.2 .string .strings .stripped_strings
- string 获取标签里面的内容
- strings 返回的是一个生成器对象,用来获取多个标签的内容
- stripped_strings 和 strings 基本一致,但是它可以把多余的空格去掉
'''
# string           -> the text inside one tag
# strings          -> a generator yielding the text of multiple tags
# stripped_strings -> same as strings, but with extra whitespace stripped
'''
from bs4 import BeautifulSoup
html = '''
<div>
<a href='#'>百度</a>
<a href='#'>阿里</a>
<a href='#'>腾讯</a>
</div>
'''
soup = BeautifulSoup(html,'lxml')
a_tag = soup.a
print(a_tag) # <a href="#">百度</a>
# string: the text inside this single tag
print(a_tag.string) # 百度
print(soup.html.string) # None -- html has several children, so .string is ambiguous
# strings: a generator used to fetch the text of multiple tags
strings = soup.strings
print(strings) # <generator object Tag._all_strings at 0x000001C767C26948>
for s in strings:
print(s) # 百度 阿里 腾讯 (whitespace kept)
# stripped_strings: same as strings, but extra whitespace is stripped
strings = soup.stripped_strings
for s in strings:
print(s) # 百度 阿里 腾讯 (whitespace removed)
4 遍历树 遍历父节点
parent 和 parents
- parent 直接获得父节点
- parents 获取所有的父节点(生成器)
'''
# parent  -> the direct parent node
# parents -> a generator over every ancestor
'''
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc,'lxml')
# parent: the direct parent node
print(soup.title.parent) # title's parent is head: <head><title>The Dormouse's story</title></head>
print(soup.head.parent) # head's parent is html, so the whole document is printed
print(soup.html.parent) # html's parent is the BeautifulSoup document itself; prints everything again
# parents: every ancestor, yielded by a generator
print(soup.title.parents) # <generator object parents at 0x0000025F937E9678>
for x in soup.title.parents:
print(x)
print('----------------')
5 遍历树 遍历兄弟节点
- next_sibling 下一个兄弟节点
- previous_sibling 上一个兄弟节点
- next_siblings 后面所有兄弟节点
- previous_siblings 前面所有兄弟节点
'''
# next_sibling      -> the next sibling node
# previous_sibling  -> the previous sibling node
# next_siblings     -> generator over all following siblings
# previous_siblings -> generator over all preceding siblings
'''
from bs4 import BeautifulSoup
# BUGFIX: the original literal ended with a stray '<a>' instead of the
# closing '</a>', leaving the markup malformed. Fixed here.
html = '<a><b>bbb</b><c>ccc</c><d>ddd</d></a>'
soup = BeautifulSoup(html,'lxml')
# print(soup.prettify())
print(soup.b) # <b>bbb</b>
print(soup.b.next_sibling) # next sibling: <c>ccc</c>
print(soup.c.previous_sibling) # previous sibling: <b>bbb</b>
print(soup.b.next_siblings) # all following siblings: <generator object PageElement.next_siblings at 0x0000023636B959C8>
for i in soup.b.next_siblings:
    print(i) # <c>ccc</c>  then  <d>ddd</d>
print(soup.d.previous_siblings) # all preceding siblings: <generator object PageElement.previous_siblings at 0x00000293D7997948>
for i in soup.d.previous_siblings:
    print(i) # <c>ccc</c>  then  <b>bbb</b>
6 搜索树
- 字符串过滤器
- 正则表达式过滤器:用正则表达式里面的compile方法编译一个正则表达式,传给 find 或者 find_all 方法,可以实现一个基于正则表达式的过滤器搜索
- 列表过滤器
- True过滤器
- 方法过滤器
from bs4 import BeautifulSoup
import re
'''
# String filter
# Regex filter: compile a pattern with re.compile and pass it to find / find_all
#   to search with a regular-expression filter
# List filter
# True filter
# Function (method) filter
'''
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc,'lxml')
# String filter: match an exact tag name
a_tags = soup.find_all('a')
print(a_tags) # every <a> tag, returned as a list
# Regex filter
# NOTE: bs4 applies re.search, so this matches any tag NAME that CONTAINS 't'
# (html, title) -- not only names that start with 't'
print(soup.find_all(re.compile('t')))
# List filter
# find both <p> and <a> tags
print(soup.find_all(['p','a']))
# find both <title> and <b> tags
print(soup.find_all(['title','b']))
# True filter: matches every tag in the document
print(soup.find_all(True)) # all tags are printed
# Function filter: the callable decides, per tag, whether it matches
def fn(tag):
return tag.has_attr('class') # only tags carrying a class attribute match
print(soup.find_all(fn))
7 find_all() 和 find()
7.1 find_all() 和 find()
- find_all()方法以列表形式返回所有的搜索到的标签数据
- find_all()方法参数:find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
name : tag 名称
attrs : 标签的属性
recursive : 是否递归搜索
text : 文本内容
limit : 限制返回的条数
**kwargs : 不定长参数,以关键字来传参
- find()方法返回搜索到的第一条数据
from bs4 import BeautifulSoup
'''
find_all()方法以列表形式返回所有的搜索到的标签数据
find()方法返回搜索到的第⼀条数据
# find_all(self, name=None, attrs={}, recursive=True, text=None,limit=None, **kwargs)
'''
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc,'lxml')
# find_all(self, name=None, attrs={}, recursive=True, text=None,limit=None, **kwargs)
# 常规使用:寻找所有a标签的内容
print(soup.find_all('a')) # 以列表返回 [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
# name='p',attrs=title
print(soup.find_all('p','title')) # [<p class="title"><b>The Dormouse's story</b></p>]
# attrs: 属性(link1)
print(soup.find_all(id = 'link1'))
# name:标签(a) attrs: 属性(link1)
print(soup.find_all('a',limit=2))
# recursive:是否递归,一般默认为True
print(soup.find_all('a',recursive=True))
# test:文本内容
print(soup.find_all(text = 'Elsie')) # ['Elsie']
print(soup.find_all(text="The Dormouse's story")) # ["The Dormouse's story", "The Dormouse's story"]
# limit:列表打印的条数,寻找的all最大值
print(soup.find_all('a',limit=1))
# find()方法返回搜索到的第⼀条数据
print(soup.find_all('a',limit=1)[0]) # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print(soup.find('a')) # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
7.2 find的其他方法
- find_parents() 搜索所有父节点
- find_parent() 搜索单个父节点
- find_next_siblings() 往下搜索所有兄弟节点
- find_next_sibling() 往下搜索单个兄弟节点
- find_previous_siblings() 往上搜索所有兄弟节点
- find_previous_sibling() 往上搜索单个兄弟节点
- find_all_next() 往下搜索所有元素
- find_next() 往下查找单个元素
from bs4 import BeautifulSoup
'''
# find_parents() 搜索所有父亲
# find_parrent() 搜索单个父亲
# find_next_siblings()搜索所有兄弟
# find_next_sibling()搜索单个兄弟
# find_previous_siblings() 往上搜索所有兄弟
# find_previous_sibling() 往上搜索单个兄弟
# find_all_next() 往下搜索所有元素
# find_next()往下查找单个元素
'''
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc,'lxml')
print(soup.p.find_parent())
print(soup.p.find_parents())
print(soup.a.find_next_sibling())
print(soup.a.find_next_siblings())
a_tags = soup.find(id='link3')
print(a_tags.find_previous_sibling())
print(a_tags.find_previous_siblings())
p_tag = soup.p
print(p_tag.find_all_next())
print(p_tag.find_next('a'))
8 修改文档树
- 修改tag的名称和属性
- 修改string 属性赋值,就相当于用当前的内容替代了原来的内容
- append(): 向tag中添加内容,就好像Python的列表的 .append() 方法
- decompose(): 删除段落,对于一些没有必要的文章段落我们可以将其删除
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc,'lxml')
tag_p = soup.p
print(tag_p) # <p class="title"><b>The Dormouse's story</b></p>
# 1. Rename a tag and change its attributes
tag_p.name = 'w' # rename <p> to <w>
tag_p['class'] = 'content' # change the class attribute
print(tag_p) # <w class="content"><b>The Dormouse's story</b></w>
# 2. Replace the string content -- assignment overwrites the old text
tag_w = soup.w
print(tag_w.string) # The Dormouse's story
tag_w.string = '修改后w标签内容'
print(tag_w.string) # 修改后w标签内容
# 3. tag.append(): add content to a tag, like Python's list.append()
print(tag_w) # <w class="content">修改后w标签内容</w>
tag_w.append('hahaha')
print(tag_w) # <w class="content">修改后w标签内容hahaha</w>
# 4. decompose(): remove a tag (with its contents) from the tree
result = soup.find(id='link1')
result.decompose()
print(soup) # the tag with id="link1" is gone