1.首先,了解虾米的一些反爬措施(虾米的反爬已经是很不错的了,但是嘿嘿),了解re,requests,beautifulsoup, selenium自动化脚本等等一系列该有的知识。开始干活
更新(2019-03-22):目前虾米对歌曲id的获取采取了加密措施,因此本文的代码已无法直接使用。不过通过抓包发现,虾米只是对id进行了加密,其他部分并未改变,所以只需解密歌曲id,依然可以下载歌曲。另外,请不要再下载我上传的虾米音乐爬取exe,它已经无法下载音乐。
2.源码
Configure.py(文件名需与下文的 import Configure 保持一致)
伪装成浏览器,否则的话会获得虾米音乐赠与的400 bad request
# DB
# DB_HOST = '192.168.153.131'
# DB_PORT = 3306
# DB_DBNAME = 'spider'
# DB_USER = 'root'
# DB_PASSWORD = '123123'
# DB_CHARSET = 'utf8mb4'
# User-Agents
# Pool of browser User-Agent strings. Entries are kept verbatim (some end
# with a trailing space inside the literal) and in their original order.
FakeUserAgents = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12",
    "Opera/9.27 (Windows NT 5.2; U; zh-cn)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13",
    "Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/4A93 ",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 ",
    "Mozilla/5.0 (Linux; U; Android 3.2; ja-jp; F-01D Build/F0001) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13 ",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_1 like Mac OS X; ja-jp) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8B117 Safari/6531.22.7",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_2_1 like Mac OS X; da-dk) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5 ",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.9 (KHTML, like Gecko) Chrome/ Safari/530.9 ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Ubuntu/11.10 Chromium/27.0.1453.93 Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36",
]
3.核心源码:爬取歌曲id,获取location并解码,然后下载
# 1.搜索歌曲,获取歌曲id
# 'https://www.xiami.com/search?key='+ keyword+"&pos=1"
# 2.xml页面,可查看网页代码,下载xml文件,并将文件拖入到指定的窗口获得文件路径
# 'https://www.xiami.com/widget/xml-single/uid/0/sid/'+id   (例如 id = 1804846198)
# 3.获取location
# 4.下载音乐
# 5. java读取文件夹
# 网页播放页面
# 'https://www.xiami.com/play?ids=/song/playlist/id/'+id+'/object_name/default/object_id/0#loaded'
# 歌曲详细信息页面
# https://www.xiami.com/song/1796063337
# 实现xml文件下载
import math
import os
from random import choice
import re
import sys
import urllib.parse
from bs4 import BeautifulSoup
import chardet # 需要导入这个模块,检测编码格式
import requests
import Configure
# purpose : University competition
# author : Comiii
# date : 2018/12/28
class Mp3Spider:
    """Scrape a Xiami billboard page and list or download its songs.

    Constructing an instance performs the whole crawl: the billboard page at
    ``url`` is fetched, up to ``MAX_SONGS`` song links are extracted, and for
    each one the single-song XML widget is requested. Depending on ``flag``
    the song name is printed (``flag == 1``) or the song is downloaded into
    the ``Download`` directory (``flag == 2``); any other value only fetches.
    """

    # One random browser-like User-Agent, chosen when the class is defined.
    header = {'user-agent': choice(Configure.FakeUserAgents)}
    # Real path of this file with its last 7 characters removed.
    # NOTE(review): presumably strips a 7-character file name — confirm.
    localUrl = os.path.realpath(__file__)[:-7]
    songs_name = []
    location = []
    # Class-level default kept for compatibility; each instance gets its own
    # dict in __init__ so state is not shared between spiders.
    SongUrls = {}
    flag = "1"

    # Maximum number of song links processed per billboard page.
    MAX_SONGS = 12
    # Seconds to wait for any single HTTP request before giving up.
    REQUEST_TIMEOUT = 10

    # Patterns for the CDATA payloads in the single-song XML widget.
    _LOCATION_RE = re.compile(
        r'<location><!\[CDATA\[(.*?)\]\]></location>', re.S)
    _SONG_NAME_RE = re.compile(
        r'<song_name><!\[CDATA\[(.*?)\]\]></song_name>', re.S)

    def __init__(self, flag, url):
        """Fetch the billboard at ``url`` and process its songs.

        Args:
            flag: 1 to print song names, 2 to download the songs.
            url: Address of the billboard page to scrape.
        """
        self.flag = flag
        self.SongUrls = {}
        r = requests.get(url, headers=self.header,
                         timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(r.content, "html.parser")
        # Song links look like:
        # <div class="song-name em"><a href="/song/1802902669">Youngblood</a></div>
        handled = 0
        for div in soup.find_all('div', {'class': 'song-name em'}):
            for a in div.find_all('a'):
                if handled >= self.MAX_SONGS:
                    return
                href = a.get('href')
                if not href:
                    # Anchor without a target: nothing to look up.
                    continue
                # Drop the 6-character "/song/" prefix to keep only the id.
                self.SingerId = href[6:]
                self.GetLocation()
                handled += 1

    def GetLocation(self):
        """Fetch the XML widget for ``self.SingerId`` and act on each song.

        Every (song name, location) pair found in the XML is stored in
        ``self.SongUrls`` and then printed or downloaded according to
        ``self.flag``. A song whose name equals the one already stored is
        skipped.
        """
        url = ("https://www.xiami.com/widget/xml-single/uid/0/sid/"
               + self.SingerId)
        r = requests.get(url, headers=self.header,
                         timeout=self.REQUEST_TIMEOUT)
        raw = r.content
        # chardet reports encoding=None when it cannot guess (e.g. an empty
        # body); fall back to UTF-8 instead of crashing in decode().
        encoding = chardet.detect(raw)['encoding'] or 'utf-8'
        xml = raw.decode(encoding, errors='replace')
        names = self._SONG_NAME_RE.findall(xml)
        locations = self._LOCATION_RE.findall(xml)
        # zip() stops at the shorter list, so a name without a matching
        # location is ignored rather than raising IndexError.
        for name, location in zip(names, locations):
            if name == self.SongUrls.get("song_name"):
                continue
            self.SongUrls['song_name'] = name
            self.SongUrls['Url'] = self.Decode(location)
            if self.flag == 1:
                print(self.SongUrls.get("song_name"))
            elif self.flag == 2:
                self.DownloadSong()

    def Decode(self, location):
        """Turn an encoded ``location`` string into a download URL.

        The original implementation was removed by the author before
        publishing (the source only contains a note saying so), which left
        this method without a body. It now fails explicitly.

        Raises:
            NotImplementedError: always, until a decoder is supplied.
        """
        raise NotImplementedError(
            "Mp3Spider.Decode was removed from the published source; "
            "supply a decoder for the Xiami <location> value.")

    def DownloadSong(self):
        """Download the song currently stored in ``self.SongUrls``.

        The file is written to ``Download/<song name>.mp3``. Characters that
        are not allowed in file names are replaced with ``_``. A failed HTTP
        request is reported and skipped so the remaining songs are still
        processed.
        """
        song_name = self.SongUrls.get("song_name")
        song_url = self.SongUrls.get("Url")
        if not song_name or not song_url:
            return
        os.makedirs("Download", exist_ok=True)
        # Fetch exactly once: the original looped over the dict's two keys
        # and therefore downloaded and wrote the same song twice.
        try:
            r = requests.get(song_url, headers=self.header,
                             timeout=self.REQUEST_TIMEOUT)
            # Do not save an HTTP error page as an .mp3 file.
            r.raise_for_status()
        except requests.RequestException as exc:
            print("Download {0:s} failed: {1}".format(song_name, exc))
            return
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', song_name)
        path = os.path.join("Download", "{0:s}.mp3".format(safe_name))
        with open(path, 'wb') as file:
            file.write(r.content)
        print("Download {0:s} Successfully.".format(song_name))
# Billboard selector (second command-line argument) -> billboard page URL.
BILLBOARD_URLS = {
    0: "https://www.xiami.com/billboard/102",  # new songs chart
    1: "https://www.xiami.com/billboard/103",  # hot songs chart
    2: "https://www.xiami.com/billboard/325",  # electronic music chart
    3: "https://www.xiami.com/billboard/306",  # playlist-inclusion chart
    4: "https://www.xiami.com/billboard/332",  # Douyin hot songs chart
    5: "https://www.xiami.com/billboard/324",  # film/TV soundtrack chart
    6: "https://www.xiami.com/billboard/307",  # Xiami share chart
}


def main(argv=None):
    """Run the spider from command-line arguments.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``. ``argv[1]`` is the mode flag passed to
            ``Mp3Spider`` and ``argv[2]`` is a key of ``BILLBOARD_URLS``.

    Returns:
        0 after the spider has run, 2 when the arguments are missing, not
        integers, or name an unknown billboard.
    """
    argv = sys.argv if argv is None else argv
    usage = "usage: {0} <flag: 1=list, 2=download> <billboard: 0-6>".format(
        argv[0] if argv else "spider")
    if len(argv) < 3:
        print(usage, file=sys.stderr)
        return 2
    try:
        flag = int(argv[1])
        music_url = int(argv[2])
    except ValueError:
        print(usage, file=sys.stderr)
        return 2
    url = BILLBOARD_URLS.get(music_url)
    if url is None:
        # The original if/elif chain left `url` unbound here (NameError).
        print("unknown billboard index: {0}".format(music_url),
              file=sys.stderr)
        print(usage, file=sys.stderr)
        return 2
    Mp3Spider(flag, url)
    return 0


if __name__ == '__main__':
    sys.exit(main())
5.有什么疑问可以在下方留言!
虾米音乐单曲或多曲下载-->exe 下载:https://download.csdn.net/download/qq_25233621/10949831