Python爬虫----bs4入门到精通(二)
文章目录
提示:以下是本篇文章正文内容,下面案例可供参考
一、修改文档树
修改文档树
● 修改tag的名称和属性
● 修改string 属性赋值,就相当于用当前的内容替代了原来的内容
● append() 向tag中添加内容,就好像Python的列表的 .append() 方法
● decompose() 删除标签,对于一些没有必要的文章段落我们可以把它删除掉
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')

# Grab the first <p> tag so we can mutate it in place.
paragraph = soup.find('p')
print('修改前的', paragraph)

# Rename the tag itself: tag.name is a writable attribute.
paragraph.name = 'new_p'
# Replace an attribute value via item assignment.
paragraph['class'] = 'new_title'
# Assigning to .string replaces the tag's entire text content.
paragraph.string = 'new_string'
# append() adds content at the end, just like list.append().
paragraph.append('new')
print('修改后的', paragraph)

# decompose() removes a tag (and everything inside it) from the tree.
print('删除前', soup)
soup.find('title').decompose()
print('删除后', soup)
二、select()方法
select()与find()的类似点 :
select_one() 对应 find()
select() 对应 find_all()
在select学习中,会涉及到css语法中的问题
css语法学习网站:https://www.w3school.com.cn/cssref/css_selectors.asp
案例html:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
select()与select_one()区别
select():把所有符合查找条件的数据统一以列表的形式返回
select_one():返回符合条件的第一条数据
select()方法使用
1、获取所有的a标签
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')

# 1. Get all the <a> tags.
# select_one() returns only the first matching element:
# print(soup.select_one('a'))  # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# select() returns every match, collected in a list:
# print(soup.select('a'))
2、找到所有class="sister"的标签
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')

# 2. Find every tag with class="sister".
# Note: select() takes a CSS selector string, not keyword arguments —
# soup.select(class_="sister") raises:
#   TypeError: select() missing 1 required positional argument: 'selector'
# In CSS, elements with class="sister" are matched by the selector '.sister'.
sisters = soup.select('.sister')
print(sisters)
3、定位到id="link1"的元素
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')

# 3. Locate the element with id="link1".
# In CSS, an element with a given id is matched by '#<id>', e.g. '#firstname'.
link1_matches = soup.select('#link1')
print(link1_matches)
4、获取title的文本数据
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')

# 4. Get the text inside <title>; select() returns a list, take the first hit.
title_node = soup.select('title')[0]
# Either .string or .get_text() extracts the text content.
print(title_node.string)      # The Dormouse's story
print(title_node.get_text())  # The Dormouse's story
select()方法----案例实战一下
案例html:
from bs4 import BeautifulSoup

html = """
<table class="tablelist" cellpadding="0" cellspacing="0">
<tbody>
<tr class="h">
<td class="l" width="374">职位名称</td>
<td>职位类别</td>
<td>人数</td>
<td>地点</td>
<td>发布时间</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=33824&keywords=python&tid=87&lid=2218">22989-金融云区块链高级研发工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-25</td>
</tr>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=29938&keywords=python&tid=87&lid=2218">22989-金融云高级后台开发</a></td>
<td>技术类</td>
<td>2</td>
<td>深圳</td>
<td>2017-11-25</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=31236&keywords=python&tid=87&lid=2218">SNG16-腾讯音乐运营开发工程师(深圳)</a></td>
<td>技术类</td>
<td>2</td>
<td>深圳</td>
<td>2017-11-25</td>
</tr>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=31235&keywords=python&tid=87&lid=2218">SNG16-腾讯音乐业务运维工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-25</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=34531&keywords=python&tid=87&lid=2218">TEG03-高级研发工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=34532&keywords=python&tid=87&lid=2218">TEG03-高级图像算法研发工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=31648&keywords=python&tid=87&lid=2218">TEG11-高级AI开发工程师(深圳)</a></td>
<td>技术类</td>
<td>4</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="odd">
<td class="l square"><a target="_blank" href="position_detail.php?id=32218&keywords=python&tid=87&lid=2218">15851-后台开发工程师</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="even">
<td class="l square"><a target="_blank" href="position_detail.php?id=32217&keywords=python&tid=87&lid=2218">15851-后台开发工程师</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
<tr class="odd">
<td class="l square"><a id="test" class="test" target='_blank' href="position_detail.php?id=34511&keywords=python&tid=87&lid=2218">SNG11-高级业务运维工程师(深圳)</a></td>
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
</tbody>
</table>
"""

soup = BeautifulSoup(html, 'lxml')

# 1. Get all the <tr> tags:
# for row in soup.select('tr'):
#     print(row)
#     print('*' * 75)

# 2. Get the second <tr> tag:
# print(soup.select('tr')[1])

# 3. Get every element with class="odd" — two equivalent selectors:
# odd_rows = soup.select('.odd')
# odd_rows = soup.select('tr[class="odd"]')
# print(odd_rows)

# 4. Get the href attribute of every <a> tag:
# for link in soup.select('a'):
#     print(link.get('href'))

# 5. Get every job title: one posting per <tr>; slice [1:] to skip the header row.
for row in soup.select('tr')[1:]:
    # stripped_strings yields the row's text cells with whitespace removed;
    # the first cell is the job title.
    cells = list(row.stripped_strings)
    print(cells[0])
实战案例(天气网)
实战网址:天气网
需求:爬取中国天气网 所有城市以及最低温度 并且通过csv保存数据
页面分析
1、网站内容是由静态加载出来的 直接确定目标url:“http://www.weather.com.cn/textFC/hb.shtml”
只能借助elements分析数据 最终还是以网页源码为主
2、通过elements分析,发现一个tr标签对应一个城市的天气数据
3、tbody在网页源码中是不存在的,是在后期渲染的产物。进而去上一级再寻找table进行查找,是可行的。
页面的所有城市天气数据都是存放在class="conMidtab"的div标签里面的
4、翻页:
http://www.weather.com.cn/textFC/hb.shtml 华北 http://www.weather.com.cn/textFC/db.shtml 东北 http://www.weather.com.cn/textFC/hd.shtml 华东
url_list = [http://www.weather.com.cn/textFC/hb.shtml, http://www.weather.com.cn/textFC/db.shtml, http://www.weather.com.cn/textFC/hd.shtml…]
[hb, db, hd ,…]
for url in url_list:
分别处理每一个区域的数据
逻辑整理
1、每一个url对应一个区域的天气数据
2、我们先找到整页的数据 class="conMidtab"的div标签中
再去找每一个省或者是直辖市所有对应的table标签
再去table标签里面找tr标签 每一个tr标签存放的是一个城市的数据 需要把前两个tr标签过滤(表头)
最后去tr中找td标签 第一个td标签是城市名字 倒数第二个td标签是最低温
3、把数据保存到csv文件中 [{城市:北京, temp:1}, {},{}]
代码实现
import requests
from bs4 import BeautifulSoup
import csv
# 需求:爬取中国天气网 所有城市以及最低温度 并且通过csv保存数据
'''
requests 发送请求
bs4 解析数据
csv 进行保存数据
'''
class TQw(object):
    """Scraper for www.weather.com.cn: collects every city's minimum
    temperature across all regions and saves the rows to a CSV file."""

    def __init__(self):
        # Browser User-Agent so the site serves normal pages.
        self.header = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/97.0.4692.99 Safari/537.36 "
        }
        # CSV column names (must match the dict keys appended below).
        self.head = ['城市', '最低温']
        # Accumulated rows: [{'城市': ..., '最低温': ...}, ...]
        self.data_list = []

    def get_html(self, url, header):
        """Fetch *url* with the given request headers and return the page
        body decoded as UTF-8 text.

        Fix: the headers dict was previously passed positionally
        (requests.get(url, header)), which binds it to the *params*
        argument, so the User-Agent header was never actually sent.
        """
        res = requests.get(url, headers=header)
        html = res.content.decode('utf-8')
        return html

    def parse_data(self, html):
        """Parse one region page and append one dict per city to data_list.

        The Hong Kong/Macao/Taiwan page contains unclosed tags, so we parse
        with 'html5lib', which auto-repairs broken markup (slower than lxml).
        Note: <tbody> does not exist in the raw page source (it is added by
        the browser when rendering), so we navigate via the <table> tags.
        """
        soup = BeautifulSoup(html, 'html5lib')
        # 2.1 The whole page's weather data lives in the div with class="conMidtab".
        con_midtab = soup.find(class_="conMidtab")
        # 2.2 One <table> per province / municipality.
        for table in con_midtab.find_all('table'):
            # 2.3 Each remaining <tr> holds one city's data;
            # skip the first two <tr> rows (table header).
            for index, tr in enumerate(table.find_all('tr')[2:]):
                td_list = tr.find_all('td')
                # 2.4 The first city row of each table carries the province
                # name in td[0], so its city name is in td[1]; every other
                # row has the city name in td[0].
                city_td = td_list[1] if index == 0 else td_list[0]
                city_name = list(city_td.stripped_strings)[0]
                # The minimum temperature is in the second-to-last <td>.
                temp_td = td_list[-2]
                city_temp = list(temp_td.stripped_strings)[0]
                self.data_list.append({'城市': city_name, '最低温': city_temp})

    def save_data(self):
        """Write all collected rows to TQ.csv, header line first."""
        # newline='' stops the csv module emitting blank lines on Windows.
        with open('TQ.csv', 'w', encoding='utf-8', newline='') as file_obj:
            writer = csv.DictWriter(file_obj, self.head)
            writer.writeheader()
            writer.writerows(self.data_list)

    def main(self):
        """Entry point: fetch and parse every region page, then save the CSV."""
        # One URL per region: 华北/东北/华东/华中/华南/西北/西南/港澳台.
        url_list = ['http://www.weather.com.cn/textFC/hb.shtml',
                    'http://www.weather.com.cn/textFC/db.shtml',
                    'http://www.weather.com.cn/textFC/hd.shtml',
                    'http://www.weather.com.cn/textFC/hz.shtml',
                    'http://www.weather.com.cn/textFC/hn.shtml',
                    'http://www.weather.com.cn/textFC/xb.shtml',
                    'http://www.weather.com.cn/textFC/xn.shtml',
                    'http://www.weather.com.cn/textFC/gat.shtml', ]
        for url in url_list:
            # Fetch the page source, then parse it into data_list.
            html = self.get_html(url, self.header)
            self.parse_data(html)
        self.save_data()
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    TQw().main()