之前写过qq音乐的爬虫,如果想下载无损音质的音乐比较困难,如果没有会员权限始终无法获取到,最后退而求其次,只能下载普通音质。近期看了下酷狗音乐,发现其无损音质和高品质音乐的链接都可以分析出来。全网搜索了一下,找到几个大神做的项目,重新进行复刻和改善,形成了这个项目。主要是搭建Flask框架,构建一个搜索引擎。
主要思想:
根据酷狗的搜索接口以及无损音乐下载接口,做出爬虫系统。采用flask框架,前端提取搜索关键字,后端调用爬虫系统采集数据,并将数据前端呈现。
爬虫开发:
一、维护一个代理池
为什么要维护一个代理池,目的显而易见,防止IP被封。把提前爬好的代理IP统一放在一个csv文件中。使用的时候随机从中抽取就好。直接放代码:
from multiprocessing.dummy import Pool as ThreadPool
import requests
from lxml import etree
import time
import csv
import random
# Listing-page URL template for the kuaidaili free-proxy site; {} is the page number.
url = 'https://www.kuaidaili.com/free/inha/{}/'
# Proxies that passed validate(); appended to concurrently by the thread pool in check().
alive_ip = []
def get_one_page(url):
    """Download one proxy-listing page; return its HTML text, or None on failure."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers)
    except requests.RequestException:
        return None
    # Anything other than HTTP 200 is treated as a failed fetch.
    return response.text if response.status_code == 200 else None
def get_one_parse(url):
    """Scrape ip:port pairs from one listing page and append them to IP/456.txt.

    Fix: the original passed the result of get_one_page() straight into
    etree.HTML(); when the download failed that result is None and lxml
    raises. We now skip the page instead of crashing.
    """
    print(url)  # progress: which page we are on
    page_html = get_one_page(url)
    if page_html is None:
        # Download failed — nothing to parse for this page.
        return
    tree = etree.HTML(page_html)
    # //*[@id="list"] locates the proxy table; td[1]/td[2] are IP and port columns.
    ips = tree.xpath('.//*[@id="list"]/table/tbody//td[1]/text()')
    ports = tree.xpath('.//*[@id="list"]/table/tbody//td[2]/text()')
    with open('IP/456.txt', 'a+') as f:  # append each candidate proxy to the pool file
        for ip, port in zip(ips, ports):
            entry = ip + ':' + port
            print("测试:{}".format(entry))
            f.write(entry + '\n')
def validate(ip):
    """Probe the Kugou tracker through proxy *ip*; record it in alive_ip if usable.

    Fix: the original bare ``except:`` also swallowed KeyboardInterrupt and
    SystemExit, making the threaded scan impossible to interrupt; narrowed
    to requests.RequestException (covers connection errors and timeouts).
    """
    proxies = {'http': ip}  # route the probe request through this candidate proxy
    try:
        # timeout=3: give up quickly on dead proxies so the pool keeps moving
        r = requests.get('http://trackercdn.kugou.com/i/v2/?appid=1005&pid=2&cmd=25&behavior=play',
                         proxies=proxies, timeout=3)
        if r.status_code == 200:
            print("成功:{}".format(ip))
            alive_ip.append(ip)  # keep the working proxy
    except requests.RequestException:
        print("无效")
def save(writer):
    """Write every proxy collected in alive_ip as one CSV row via *writer*."""
    for proxy_ip in alive_ip:
        writer.writerow([proxy_ip])
        print(proxy_ip)
    print("成功保存所有有效 ip ")
#做一个线程池实现多线程检测爬虫的有效性
def check(writer):
    """Validate every proxy stored in IP/456.txt concurrently, then save the live ones.

    Fix: the original never closed the thread pool; the ``with`` block now
    terminates its worker threads deterministically. The redundant
    ``map(lambda..., [line for line in lines])`` is also collapsed into a
    single comprehension.
    """
    with open('IP/456.txt', 'r') as f:
        # strip() removes the trailing newline (and any surrounding whitespace)
        ips = [line.strip() for line in f]
    with ThreadPool(20) as pool:  # 20 concurrent validation requests
        pool.map(validate, ips)
    save(writer)  # persist the proxies that survived validate()
#该函数是用于酷狗爬虫调用,在维护代理池中没用
def get_proxy():
    """Pick one random proxy from the validated CSV and return a requests ``proxies`` dict.

    Fixes:
    - ``split('/r')`` was a typo for ``'\\r'`` and was a no-op after strip(); removed.
    - the CSV header row ('ip') could be randomly chosen; it is now skipped.
    - requests matches proxies by URL scheme, so the key must be 'http', not
      'http://' — with the old key the proxy was silently never applied.
    - the file handle is now closed via a context manager.
    """
    with open('KuGou_Spid/IP/validate.csv', mode='r', encoding='utf-8', newline='') as f:
        datas = f.readlines()[1:]  # [1:] drops the 'ip' header row
    ip = random.choice(datas).strip()
    return {'http': 'http://' + ip}
def main():
    """Crawl the free-proxy listing pages, validate every proxy, save the live ones.

    Fix: the CSV file is now closed via a context manager even if the crawl
    raises; the page-count comment matched neither the code (range(1, 50)
    crawls 49 pages, not "forty").
    """
    url = 'https://www.kuaidaili.com/free/inha/{}/'  # listing URL template
    with open('IP/validate.csv', 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['ip'])  # header row
        for i in range(1, 50):  # crawl pages 1..49
            time.sleep(1)  # be polite: wait 1 second between pages
            get_one_parse(url.format(i))
        check(writer)


if __name__ == '__main__':
    main()
二、酷狗爬虫分析
实现酷狗的爬虫需要用到两个接口,一个是歌曲搜索的接口,一个是歌曲下载的接口。通过歌曲搜索的接口获取歌曲的HQhash和SQhash,然后再将这个hash值传递给歌曲下载的接口实现歌曲下载。
歌曲搜索接口:
http://mobilecdn.kugou.com/api/v3/search/song?format=json&keyword={}&page=1
功能实现:
# coding=utf-8
# web_request.py
import requests
import json
from KuGou_Spid import free_proxyIP
# Browser-like request headers so the Kugou endpoints serve their normal responses.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/63.0.3239.132 Safari/537.36',
}
# One proxy drawn from the pre-validated pool at import time, reused for every request.
proxy = free_proxyIP.get_proxy()
def parse(url):
    """GET *url* and return the JSON-decoded response as a dict.

    Fixes (the original try/except/else was inverted):
    - on success the ``else`` clause re-fetched the URL *without* the proxy
      and clobbered the proxied result;
    - on failure ``ret`` was never assigned, so ``return ret`` raised
      NameError instead of falling back.
    Now: try through the proxy first; on any failure retry directly.
    """
    try:
        # First attempt goes through the proxy pool.
        ret = json.loads(requests.get(url, headers=headers, proxies=proxy, timeout=5).text)
    except Exception as e:
        print(e)
        # Fallback: direct (non-proxied) request.
        ret = json.loads(requests.get(url, headers=headers, timeout=5).text)
    # ret is the already-decoded dict.
    return ret


if __name__ == '__main__':
    # The original called parse() with no argument (TypeError); pass a real URL.
    print(parse('http://mobilecdn.kugou.com/api/v3/search/song?format=json&keyword=test&page=1'))
# Music_Search.py
import copy
from KuGou_Spid import web_request
def search(keyword):
    """Query Kugou's song-search API for *keyword*.

    Returns a list of dicts (Song, Singer, SQHash, HQHash, Size, minute,
    second) for every track that has a lossless (SQ) hash, or None when the
    search yields nothing — some IP/keyword combinations are throttled, so
    the caller must handle None.

    Fixes: the ``__main__`` guard called search() with no argument
    (TypeError); the shared dict + copy.deepcopy dance is replaced with a
    fresh dict per track (identical output).
    """
    search_url = 'http://mobilecdn.kugou.com/api/v3/search/song?format=json&keyword={}&page=1'.format(keyword)
    # First request only reports how many hits exist.
    total = web_request.parse(search_url)['data']['total']
    if total == 0:
        return None
    # Second request fetches all hits in one page.
    music_list = web_request.parse(search_url + '&pagesize=%d' % total)['data']['info']
    items = []
    for music in music_list:
        if music['sqhash'] == '0' * 32:  # all-zero hash: no lossless file available
            continue
        duration = music['duration']
        seconds = duration % 60
        items.append({
            'Song': music['songname_original'],
            'Singer': music['singername'],
            'SQHash': music['sqhash'],   # lossless-quality hash
            'HQHash': music['hash'],     # high-quality hash
            'Size': str(round((music['sqfilesize'] / 1024) / 1024, 2)) + 'M',
            'minute': str(int(duration / 60)),
            # zero-pad single-digit seconds, e.g. 65s -> minute '1', second '05'
            'second': '0' + str(seconds) if seconds < 10 else str(seconds),
        })
    return items


if __name__ == '__main__':
    print(search('海阔天空'))
歌曲下载接口:
# V2 system, PC client; key is md5(hash + "kgcloudv2")
Music_api_1 = 'http://trackercdnbj.kugou.com/i/v2/?cmd=23&pid=1&behavior=download'
# V2 system, mobile client; key is md5(hash + "kgcloudv2") (fallback)
Music_api_2 = 'http://trackercdn.kugou.com/i/v2/?appid=1005&pid=2&cmd=25&behavior=play'
# Legacy system; key is md5(hash + "kgcloud") (fallback)
Music_api_3 = 'http://trackercdn.kugou.com/i/?cmd=4&pid=1&forceDown=0&vip=1'
功能实现:
# coding=utf-8
# Music_Download.py — resolve download URLs for tracks found by Music_Search
import copy
import hashlib
from KuGou_Spid import web_request
from KuGou_Spid import Music_Search
# V2 system, PC client; key is md5(hash + "kgcloudv2")
Music_api_1 = 'http://trackercdnbj.kugou.com/i/v2/?cmd=23&pid=1&behavior=download'
# V2 system, mobile client; key is md5(hash + "kgcloudv2") (fallback)
Music_api_2 = 'http://trackercdn.kugou.com/i/v2/?appid=1005&pid=2&cmd=25&behavior=play'
# Legacy system; key is md5(hash + "kgcloud") (fallback)
Music_api_3 = 'http://trackercdn.kugou.com/i/?cmd=4&pid=1&forceDown=0&vip=1'
def V2Md5(Hash):
    """Key for the V2 Kugou system: hex md5 of (hash + 'kgcloudv2')."""
    digest = hashlib.md5()
    digest.update((Hash + 'kgcloudv2').encode('utf-8'))
    return digest.hexdigest()
def Md5(Hash):
    """Key for the legacy Kugou system: hex md5 of (hash + 'kgcloud')."""
    digest = hashlib.md5()
    digest.update((Hash + 'kgcloud').encode('utf-8'))
    return digest.hexdigest()
def HighSearch(keyword):
    """Search Kugou for *keyword* and resolve a download URL for each hit.

    Tries the lossless (SQ) hash first against the V2 tracker API; when the
    tracker answers status == 2, retries with the HQ hash. Returns a list
    of dicts (Song, Singer, url, Size, minute, second), or None when the
    underlying search found nothing. Tracker responses lacking a usable
    'status'/'url' entry are skipped, matching the original
    KeyError-swallowing behaviour.

    Fixes: the ``__main__`` guard called HighSearch() with no argument
    (TypeError); shared dict + deepcopy replaced by a fresh dict per track.
    """
    music_list = Music_Search.search(keyword)
    if music_list is None:
        return None  # nothing matched the keyword
    items = []
    for music in music_list:
        sq_hash = str.lower(music['SQHash'])
        hq_hash = str.lower(music['HQHash'])
        try:
            down = web_request.parse(Music_api_2 + '&hash=%s&key=%s' % (sq_hash, V2Md5(sq_hash)))
            if down['status'] == 2:  # SQ hash rejected — fall back to the HQ hash
                down = web_request.parse(Music_api_2 + '&hash=%s&key=%s' % (hq_hash, V2Md5(hq_hash)))
            items.append({
                'Song': music['Song'],
                'Singer': music['Singer'],
                'url': down['url'][0],  # first mirror URL offered by the tracker
                'Size': music['Size'],
                'minute': music['minute'],
                'second': music['second'],
            })
        except KeyError:
            pass  # malformed tracker response — skip this track
    return items


if __name__ == '__main__':
    print(HighSearch('海阔天空'))
三、Flask框架
Flask是轻量级框架,将前端的搜索关键字传递给后台爬虫。将搜索结果以list.html呈现。
# coding=utf-8
# app.py
import sys
from flask import Flask
from flask_cors import *
from imp import reload
#导入render_template
from flask import request,render_template
from KuGou_Spid import Music_download
# NOTE(review): reload(sys) is a Python 2 relic (used to enable
# sys.setdefaultencoding); on Python 3 it is a no-op — candidate for removal.
reload(sys)
app = Flask(__name__)
CORS(app)  # allow cross-origin requests from the front-end
# Decorator-based routing below; the dev server listens on 127.0.0.1:5000
@app.route('/', methods=['GET', 'POST'])
def search():
    """Render the search page (GET) or run a keyword search and show results (POST).

    Fix: ``items != None`` replaced with the idiomatic ``items is not None``.
    """
    if request.method == 'GET':
        return render_template('search.html')
    elif request.method == 'POST':
        keyword = request.form.get('keyword')
        items = Music_download.HighSearch(keyword)
        if items is not None:
            return render_template('list.html', list=items)
        else:
            return '找不到!!!不支持英文'
    else:
        # Unreachable while methods is limited to GET/POST; kept as a safety net.
        return render_template('404.html')
if __name__ == '__main__':
    # Run the Flask development server with the debugger/auto-reloader enabled.
    app.run(debug=True)
    # free_proxyIP.main()