Python学习之旅
Python核心编程爬虫篇2021.04.07
指导教师:逻辑教育讲师Jerry
一、简介
- 1、bs4 概念
bs4:BeautifulSoup4。美味的汤,取自刘易斯·卡罗尔在《爱丽丝梦游仙境》里的同名诗歌,像诗歌中的说法一样,BeautifulSoup 化平淡为神奇。爬虫中,它是一个可以对 HTML 或 XML 文件进行数据处理的网页信息提取库。
- 2、如何学习
①我们可以在 https://github.com/ 中下载 bs4 的源码来分析学习。
②使用前需要安装:pip install lxml、pip install bs4。
其中,lxml 为第三方解析器 lxml HTML。BeautifulSoup 另外还支持的解析器有 Python 标准库中的 HTML 解析器、html5lib、lxml XML。使用 lxml 作为解析器,效率更高更稳定。
- 3、对象种类
Beautiful Soup 将 HTML 文档转换成一个树形结构,其中每个节点都是一个 Python 对象,可以归纳为 4 类:Tag(标签)、NavigableString(可导航字符串)、BeautifulSoup(bs 对象)、Comment(注释)。
from bs4 import BeautifulSoup

# Sample document; note the HTML comment at the end of the first line.
html_doc = """
<html><head><title>The Dormouse's story</title></head><!--同学们新年快乐,牛年大吉-->
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'lxml')  # build the BeautifulSoup object
# print(soup.prettify())  # pretty-print with standard indentation

# The four object kinds a parsed document is made of:
print(type(soup))               # <class 'bs4.BeautifulSoup'> — the document object itself
print(soup.title)               # <title>The Dormouse's story</title> — first <title> tag
print(type(soup.title))         # <class 'bs4.element.Tag'>
print(soup.title.string)        # The Dormouse's story — text inside the <title> tag
print(type(soup.title.string))  # <class 'bs4.element.NavigableString'>
print(soup.p)                   # first <p> tag
print(soup.a)                   # first <a> tag

comment_markup = '<a><!--同学们新年快乐,牛年大吉--></a>'
comment_soup = BeautifulSoup(comment_markup, 'lxml')
print(comment_soup.string)        # prints the comment's text
print(type(comment_soup.string))  # <class 'bs4.element.Comment'> — a NavigableString subclass

print(soup.title.name)     # tag name: 'title'
print(soup.find_all('p'))  # every <p> tag — note the quotes: 'p' is a string filter
# print(len(soup.find_all('p')))  # number of <p> tags

# Collect the link target of every <a> tag.
anchors = soup.find_all('a')  # list of Tag objects; 'a' acts as a string filter
# print(anchors)
for anchor in anchors:
    print(anchor.get('href'))
二、使用
bs
可以对其对象文档中的内容进行遍历、查找、修改操作
1.遍历文档
1.1遍历子节点
- contents:返回的是一个所有子节点的列表
- children:返回的是一个子节点的迭代器
- descendants:返回的是一个生成器,可以遍历子子孙孙
- string:获取标签里面的内容
- strings:返回的是一个生成器对象,用来获取多个标签内容
- stripped_strings:和 strings 基本一致,但是它可以把多余的空格去掉
1.2遍历父节点
- parent:直接获得父节点
- parents:获取所有的父节点,返回生成器对象
1.3遍历兄弟节点
- next_sibling:下一个兄弟节点
- previous_sibling:上一个兄弟节点
- next_siblings:下面的所有兄弟节点
- previous_siblings:上面的所有兄弟节点
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'lxml')  # parse the markup into a BeautifulSoup tree

# --- Child nodes ---
head_tag = soup.head  # first <head> tag
print(head_tag)  # <head><title>The Dormouse's story</title></head>
print(head_tag.contents)  # list of direct children: [<title>The Dormouse's story</title>]
child_iter = head_tag.children  # iterator over direct children; next() past the end raises
print(child_iter)  # e.g. <list_iterator object at 0x...>
print(next(child_iter))  # <title>The Dormouse's story</title>
# print(next(child_iter))  # StopIteration — <head> has only one child
desc_gen = head_tag.descendants  # generator over ALL descendants, depth-first
print(desc_gen)
print(next(desc_gen))  # <title>The Dormouse's story</title>
print(next(desc_gen))  # The Dormouse's story
# print(next(desc_gen))  # StopIteration
print(head_tag.string)  # The Dormouse's story — the single nested string

html_tag = soup.html
print(html_tag)
print(html_tag.string)  # None — .string is None when a tag has multiple children
strings_gen = html_tag.strings  # generator yielding every text fragment in the subtree
print(strings_gen)
for fragment in strings_gen:
    print(fragment)
for fragment in html_tag.stripped_strings:  # like .strings but with extra whitespace stripped
    print(fragment)

# --- Parent nodes ---
title_tag = soup.title
print(title_tag.parent)  # direct parent: <head><title>The Dormouse's story</title></head>
ancestor_gen = title_tag.parents  # generator walking every ancestor up to the document
print(ancestor_gen)  # e.g. <generator object parents at 0x...>
for ancestor in ancestor_gen:
    print(ancestor)
print('*' * 15)
print('*' * 15)

# --- Sibling nodes ---
html_doc1 = '<a><b>bbb</b><c>ccc</c></a>'
soup1 = BeautifulSoup(html_doc1, 'lxml')
b_tag = soup1.b
print(b_tag.next_sibling)  # the following sibling: <c>ccc</c>
c_tag = soup1.c
print(c_tag.previous_sibling)  # the preceding sibling: <b>bbb</b>
2.查找
2.1 find()
- 返回搜索到的第一条数据
2.2 find_all()
- 以列表形式返回所有的搜索到的标签数据
find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
- name:标签名称
- attrs:属性名称
- recursive:是否递归搜索
- text:标签内容
- limit:返回数量
- **kwargs:关键字参数
from bs4 import BeautifulSoup

# Sample document: three "sister" links inside a story paragraph.
html_doc1 = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc1, 'lxml')
a_tag = soup.find('a')  # first <a> tag only; 'a' acts as a string filter
print(a_tag)
print(soup.find_all('a'))  # every <a> tag, returned as a list
print(soup.find_all(['p', 'a']))  # list filter: match any of several tag names
print(soup.find_all(['title', 'b']))
# Job-listing table used by the attribute-filter examples below.
html_doc2 = """
<table class="tablelist" cellpadding="0" cellspacing="0">
<tbody>
<tr class="h">
<td class="l" width="374">职位名称</td>
<td>职位类别</td>
<td>人数</td>
<td>地点</td>
<td>发布时间</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=33824&keywords=python&tid=87&lid=2218">22989-金融云区块链高级研发工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-25</td>
</tr>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=29938&keywords=python&tid=87&lid=2218">22989-金融云高级后台开发</a></td>
<td>技术类</td>
<td>2</td>
<td>深圳</td>
<td>2017-11-25</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=31236&keywords=python&tid=87&lid=2218">SNG16-腾讯音乐运营开发工程师(深圳)</a></td>
<td>技术类</td>
<td>2</td>
<td>深圳</td>
<td>2017-11-25</td>
</tr>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=31235&keywords=python&tid=87&lid=2218">SNG16-腾讯音乐业务运维工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-25</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=34531&keywords=python&tid=87&lid=2218">TEG03-高级研发工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=34532&keywords=python&tid=87&lid=2218">TEG03-高级图像算法研发工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=31648&keywords=python&tid=87&lid=2218">TEG11-高级AI开发工程师(深圳)</a></td>
<td>技术类</td>
<td>4</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=32218&keywords=python&tid=87&lid=2218">15851-后台开发工程师</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=32217&keywords=python&tid=87&lid=2218">15851-后台开发工程师</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="odd">
<td class="l square"><a id="test" class="test" target='_blank' href="position_detail.php?id=34511&keywords=python&tid=87&lid=2218">SNG11-高级业务运维工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
</tbody>
</table>
"""
soup2 = BeautifulSoup(html_doc2, 'lxml')
# 1 Get every <tr> tag.
print(soup2.find_all('tr'))
# 2 Get the second <tr> tag.
print(soup2.find_all('tr')[1])
print(soup2.find_all('tr', limit=2)[1])  # limit caps how many matches are returned
# print('*'*80)
# print('*'*80)
# 3 Get every <tr> whose class is "even" (class_ avoids the Python keyword).
tr_tag = soup2.find_all('tr', class_='even')
print(tr_tag)
# 4 Extract the <a> tag whose id and class are both "test".
a = soup2.find_all('a', id='test', class_='test')
print(a)
# 5 Get the href attribute of every <a> tag.
a1 = soup2.find_all('a')
for a in a1:
    # href = a['href']
    href = a.get('href')  # .get() returns None instead of raising when the attribute is absent
    print(href)
# 6 Get every job title (the text of the first <td> in each data row).
trs = soup2.find_all('tr')[1:]  # every <tr> except the header row
for tr in trs:
    td = tr.find_all('td')[0]  # first <td> of this row holds the job-title link
    job = td.string  # the cell's (single) nested string
    print(job)
2.3 select()
通过css选择器也可以提取数据,其语法:https://www.w3school.com.cn/cssref/css_selectors.asp,所提取的数据以列表返回
from bs4 import BeautifulSoup

# Sample document: three "sister" links inside a story paragraph.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'lxml')
# 1 Select by tag name.
print(soup.select('a'))  # every <a> tag, as a list
# 2 Select by class name: .classname
print(soup.select('.sister'))
# 3 Select by id: #idvalue
print(soup.select('#link1'))
# 4 Combined selectors.
print(soup.select('p #link1'))  # elements with id="link1" anywhere inside a <p>
print(soup.select('head>title'))  # <title> elements whose direct parent is <head>
# 5 Extract text content.
print(soup.select('title')[0].get_text())  # text of the first matched <title>
# Job-listing table used by the selector examples below.
html = """
<table class="tablelist" cellpadding="0" cellspacing="0">
<tbody>
<tr class="h">
<td class="l" width="374">职位名称</td>
<td>职位类别</td>
<td>人数</td>
<td>地点</td>
<td>发布时间</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=33824&keywords=python&tid=87&lid=2218">22989-金融云区块链高级研发工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-25</td>
</tr>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=29938&keywords=python&tid=87&lid=2218">22989-金融云高级后台开发</a></td>
<td>技术类</td>
<td>2</td>
<td>深圳</td>
<td>2017-11-25</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=31236&keywords=python&tid=87&lid=2218">SNG16-腾讯音乐运营开发工程师(深圳)</a></td>
<td>技术类</td>
<td>2</td>
<td>深圳</td>
<td>2017-11-25</td>
</tr>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=31235&keywords=python&tid=87&lid=2218">SNG16-腾讯音乐业务运维工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-25</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=34531&keywords=python&tid=87&lid=2218">TEG03-高级研发工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=34532&keywords=python&tid=87&lid=2218">TEG03-高级图像算法研发工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=31648&keywords=python&tid=87&lid=2218">TEG11-高级AI开发工程师(深圳)</a></td>
<td>技术类</td>
<td>4</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=32218&keywords=python&tid=87&lid=2218">15851-后台开发工程师</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=32217&keywords=python&tid=87&lid=2218">15851-后台开发工程师</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="odd">
<td class="l square"><a id="test" class="test" target='_blank' href="position_detail.php?id=34511&keywords=python&tid=87&lid=2218">SNG11-高级业务运维工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
</tbody>
</table>
"""
soup1 = BeautifulSoup(html, 'lxml')
# 1 Get every <tr> tag.
print(soup1.select('tr'))
# 2 Get the second <tr> tag.
print(soup1.select('tr')[1])
# 3 Get every <tr> whose class is "even" (tag.class selector).
print(soup1.select('tr.even'))
# 4 Get the href attribute of every <a> tag.
a_tag = soup1.select('a')
print(len(a_tag))
for a in a_tag:
    print(a.get('href'))
# 5 Get every job title (the link text of each <a>).
a_tag1 = soup1.select('a')
for a in a_tag1:
    print(a.string)
3.修改
对文档的修改操作包括对其添加和删除操作。
"""
修改tag的名称和属性
修改string 属性赋值,就相当于用当前的内容替代了原来的内容
append() 像tag中添加内容,就好像Python的列表的 .append() 方法
decompose() 修改删除段落,对于一些没有必要的文章段落我们可以给他删除掉
"""
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'lxml')
p_tag = soup.p # 获取第一个p标签
print(p_tag) # <p class="title"><b>The Dormouse's story</b></p>
p_tag['class'] = 'hello' # 修改class属性值
p_tag.name = 'w' # 修改标签名称
p_tag.string = 'hello word' # 修改标签内容 b标签没有了
p_tag.append('Hi') # 添加内容Hi
print(p_tag) # <w class="hello">hello wordHi</w>
title_tag = soup.title # 获取第一个title标签
print(title_tag) # <title>The Dormouse's story</title>
title_tag.decompose() # 删除title标签
print(soup)
三、案例(爬取天气信息)
"""
爬取全国城市的天气信息,并保存到csv文件中
注意网页分析:日期与天气不在一标签
解析器:html5lib
"""
import requests
from bs4 import BeautifulSoup
import csv
import time
# 定义信息获取函数
def get_htmltest(url, a):
    """Fetch one regional forecast page from weather.com.cn and parse it.

    url: page URL for one region, e.g. http://www.weather.com.cn/textFC/hb.shtml
    a:   human-readable region name, stored in each record's '地区' field
    Returns a list of dicts, one per city per day tab.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
    }
    res = requests.get(url, headers=headers)  # fetch the page
    text = res.content.decode('utf-8')
    # html5lib tolerates this site's malformed markup better than lxml.
    soup = BeautifulSoup(text, 'html5lib')
    div = soup.find('div', class_='contentboxTab2')
    # The day tabs (dates) live in a <ul class="day_tabs"> separate from the tables.
    data = div.find('ul', class_='day_tabs')
    day_li = [li.string for li in data.find_all('li')]  # one label per day tab
    hanml = div.find('div', class_='hanml')
    conMidtabs = hanml.find_all('div', class_='conMidtab')  # one block per day tab
    weathers = []
    for day_index, conMidtab in enumerate(conMidtabs):
        tables = conMidtab.find_all('table')  # one table per province
        for table in tables:
            trs = table.find_all('tr')[2:]  # skip the two header rows
            for index, tr in enumerate(trs):
                weather = {}
                tds = tr.find_all('td')
                # A province's first city row carries a rowspan province cell,
                # which pushes the city link to tds[1]; later rows start at tds[0].
                if index == 0:
                    weather['城市'] = tds[1].find('a').string
                else:
                    weather['城市'] = tds[0].find('a').string
                # Day and night condition cells both use width="89", so a plain
                # find() would return the day cell twice; grab both and index.
                # (assumes two such cells per row — TODO confirm against live markup)
                condition_tds = tr.find_all('td', width='89')
                day_wind_td = tr.find('td', width='162')
                night_wind_td = tr.find('td', width='177')
                weather['白天天气'] = condition_tds[0].string
                weather['白天风向'] = day_wind_td.contents[1].string
                weather['白天风力'] = day_wind_td.contents[3].string
                weather['最高气温'] = tr.find('td', width='92').string
                weather['夜间天气'] = condition_tds[-1].string
                weather['夜间风向'] = night_wind_td.contents[1].string
                weather['夜间风力'] = night_wind_td.contents[3].string
                weather['最低温度'] = tr.find('td', width='86').string
                weather['日期'] = day_li[day_index]  # date tab this table belongs to
                weather['地区'] = a
                weathers.append(weather)
    return weathers
# 定义csv文件表头写入函数
def filehead_fun(headers):
    """Create weather.csv and write only the header row.

    newline='' keeps the csv module from inserting blank lines on Windows.
    """
    with open('weather.csv', 'w', encoding='utf-8', newline='') as fp:
        csv.DictWriter(fp, headers).writeheader()
# 定义csv文件写入函数
def file_fun(weathers, heads):
    """Append the collected rows to weather.csv.

    Append mode because the file is written once per region page;
    newline='' avoids blank lines between rows.
    """
    with open('weather.csv', 'a', encoding='utf-8', newline='') as fp:
        csv.DictWriter(fp, heads).writerows(weathers)
def main():
    """Crawl every regional page and accumulate all rows into weather.csv."""
    # Region code -> region name; iteration order matches the original crawl order.
    areas1 = {'hb': '华北', 'db': '东北',
              'hd': '华东', 'hz': '华中',
              'hn': '华南', 'xb': '西北',
              'xn': '西南', 'gat': '港澳台'}
    headers = ['城市', '白天天气', '白天风向', '白天风力', '最高气温', '夜间天气', '夜间风向', '夜间风力', '最低温度', '日期', '地区']
    filehead_fun(headers)  # write the header row once
    for code, region_name in areas1.items():
        page_url = f'http://www.weather.com.cn/textFC/{code}.shtml'
        # print(page_url)
        weathers = get_htmltest(page_url, region_name)  # scrape one region
        file_fun(weathers, headers)
        time.sleep(0.5)  # small pause between requests to be polite to the server


if __name__ == '__main__':
    main()