前言
前几天实现了爬取网易云音乐榜单,并保存到csv,今日通过 urllib.request 实现获取音乐文件以及对应图片,并将下载进度实时显示在控制台
提示:以下是本篇文章正文内容,下面案例可供参考
一、用到函数:
urllib.request.urlretrieve(url, filename=None, reporthook=None, data=None)
参数说明:
url:外部或者本地url
filename:指定了保存到本地的路径(如果未指定该参数,urllib会生成一个临时文件来保存数据);
reporthook:是一个回调函数,当连接上服务器、以及相应的数据块传输完毕的时候会触发该回调。我们可以利用这个回调函数来显示当前的下载进度。
data:指post到服务器的数据。该方法返回一个包含两个元素的元组(filename, headers),filename表示保存到本地的路径,header表示服务器的响应头。
二、使用步骤
1.引入库
代码如下:
import json
import sys
import requests
from bs4 import BeautifulSoup
from urllib import request
import csv
2.读入数据
代码如下:
url = 'https://music.163.com/discover/toplist'  # NetEase Cloud Music toplist page to scrape
url_head = 'https://music.163.com'  # site root, later joined with relative song links
# Browser-like User-Agent so the site serves the normal HTML instead of rejecting the script
headers = { 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'}
# Fetch the chart page; the following snippets parse this response
response = requests.get(url, headers=headers)
该处获取网易云榜单url请求的数据。
3.解析链接,获取歌曲源链
代码如下:
def get_musics_url(urls):
    """Resolve a public page URL to its final (redirected) media URL.

    :param urls: the visible page URL
    :return: the URL the request ends up at after redirects
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/87.0.4280.141 Safari/537.36'
    )
    # requests follows redirects by default; .url is the final location
    return requests.get(urls, headers={'user-agent': user_agent}).url
4.写入数据
代码如下:
def get_music(url, name, formats, site, piace):
    """Download a remote file to ``{site}/{name}.{formats}`` with an in-place
    console progress line.

    :param url: direct download URL
    :param name: file name without extension
    :param formats: file extension, e.g. 'mp3' or 'jpg'
    :param site: target directory (must already exist)
    :param piace: current loop counter, shown as ``piace/100``
    """
    def _progress(block_num, block_size, total_size):
        """reporthook for urlretrieve.

        :param block_num: number of blocks transferred so far
        :param block_size: block size in bytes
        :param total_size: total file size in bytes, or -1 if the server
            sent no Content-Length header
        """
        downloaded = block_num * block_size
        # Bug fix: total_size is -1 when unknown (negative MB / percent),
        # and downloaded can overshoot total_size on the final block (>100%).
        if total_size > 0:
            total_mb = total_size / 1024 ** 2
            percent = min(downloaded / total_size * 100.0, 100.0)
        else:
            total_mb = 0.0
            percent = 0.0
        sys.stdout.write(
            f'\r>> Downloading_{site}\t{name} \t{downloaded / 1024 ** 2:5.2f}MB/{total_mb:5.2f}'
            f'MB\t{percent:3.0f}%\t{piace}/100')
        sys.stdout.flush()
    request.urlretrieve(url, f'{site}/{name}.{formats}', _progress)
通过get_music()函数获取网络文件,通过_progress()函数回调request.urlretrieve函数获取下载信息。
5.控制台截图
全部代码:
import json
import sys
import requests
from bs4 import BeautifulSoup
from urllib import request
import csv
import os
def get_music(urls, name, formats, site, piace):
    """Resolve *urls* to its final location and download it to
    ``{site}/{name}.{formats}``, printing an in-place progress line.

    :param urls: page/outer URL; resolved via get_musics_url before download
    :param name: file name without extension
    :param formats: file extension, e.g. 'mp3' or 'jpg'
    :param site: target directory (must already exist)
    :param piace: current loop counter, shown as ``piace/100``
    """
    def _progress(block_num, block_size, total_size):
        """reporthook for urlretrieve.

        :param block_num: number of blocks transferred so far
        :param block_size: block size in bytes
        :param total_size: total file size in bytes, or -1 if the server
            sent no Content-Length header
        """
        downloaded = block_num * block_size
        # Bug fix: total_size is -1 when unknown (negative MB / percent),
        # and downloaded can overshoot total_size on the final block (>100%).
        if total_size > 0:
            total_mb = total_size / 1024 ** 2
            percent = min(downloaded / total_size * 100.0, 100.0)
        else:
            total_mb = 0.0
            percent = 0.0
        sys.stdout.write(
            f'\r>> Downloading_{site}\t{name} \t{downloaded / 1024 ** 2:5.2f}MB/{total_mb:5.2f}'
            f'MB\t{percent:3.0f}%\t{piace}/100')
        sys.stdout.flush()
    request.urlretrieve(get_musics_url(urls), f'{site}/{name}.{formats}', _progress)
def get_musics_url(urls):
    """Follow redirects on *urls* and return the final URL.

    :param urls: the visible page URL
    :return: the redirected (final) URL reported by the response
    """
    ua = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
          'Chrome/87.0.4280.141 Safari/537.36')
    resp = requests.get(urls, headers={'user-agent': ua})
    return resp.url
def get_music_name():
    """Scrape the NetEase Cloud Music toplist, write it to music_message.csv
    and return the rows.

    :return: list of row dicts on success, None when the page request fails
    """
    def music_times(time):
        """Convert a duration in milliseconds to a 'minutes:seconds' string.

        :param time: duration in milliseconds
        :return: 'M:SS' with seconds zero-padded to two digits
        """
        total_seconds = time // 1000
        minutes, seconds = divmod(total_seconds, 60)
        return f'{minutes}:{seconds:02d}'

    url = 'https://music.163.com/discover/toplist'
    url_head = 'https://music.163.com'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/87.0.4280.141 Safari/537.36'}
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print('网页丢失!')
        return None
    # newline='' is required by the csv module to avoid blank lines on Windows
    with open('music_message.csv', 'w', encoding='utf-8', newline='') as stream:
        fields = ['music_ranking', 'music_id', 'music_name', 'music_time',
                  'music_author', 'music_url', 'pic_url']
        dict_stream = csv.DictWriter(stream, fields)
        # Bug fix: the header row was never written before
        dict_stream.writeheader()
        soup = BeautifulSoup(response.text, 'html.parser')
        # The chart data is embedded as JSON inside a hidden <textarea>
        comment = soup.select_one('#song-list-pre-cache')
        music_dict = json.loads(comment.select_one('textarea').string)
        all_comments = []
        # <a> tags and the JSON array are index-aligned: entry i describes link i
        for i, item in enumerate(comment.select('a')):
            info = music_dict[i]
            all_comments.append({
                'music_ranking': i + 1,
                'music_id': info['id'],
                'music_name': item.text,
                'music_time': music_times(info['duration']),
                'music_author': info['artists'][0]['name'],
                'music_url': url_head + item.get('href'),
                'pic_url': info['album']['picUrl'],
            })
        dict_stream.writerows(all_comments)
        print('写入完毕')
        return all_comments
if __name__ == '__main__':
    # Bug fix: os.mkdir raised FileExistsError on any second run;
    # makedirs with exist_ok=True is idempotent.
    os.makedirs('mp3', exist_ok=True)
    os.makedirs('img', exist_ok=True)
    # Outer-link endpoint that redirects to the actual media file for a song id
    music_url_head = 'http://music.163.com/song/media/outer/url?id='
    music_list = get_music_name()  # list of chart-row dicts, or None
    if not music_list:
        exit('获取榜单失败!!!')
    for j, music_message in enumerate(music_list, start=1):
        # Cover image first, then the audio track
        get_music(music_message['pic_url'], music_message['music_name'], 'jpg', 'img', j)
        url = get_musics_url(music_url_head + str(music_message['music_id']) + '.mp3')
        get_music(url, music_message['music_name'], 'mp3', 'mp3', j)