import requests
from lxml import etree
import os
from urllib import request
# def base_response():
def list_url(queue, timeout=10):
    """Fetch the album page and enqueue download info for every listed track.

    The album page is requested once, the per-track hrefs are extracted with
    XPath, and for each track the play-info AJAX URL is built and handed to
    ``html_url_src``, which puts ``(src, name)`` tuples on ``queue``.

    Args:
        queue: Object with a ``put`` method that receives the tuples.
        timeout: Seconds to wait for the album page before giving up.

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            non-success HTTP status for the album page.
    """
    # No pagination is implemented; only this fixed page is scraped.
    base_url = 'https://www.ximalaya.com/xiangsheng/9723091'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Cookie': 'Hm_lvt_4a7d8ec50cfd6af753c4f8aee3425070=1534771185,1534771396,1534771412; Hm_lpvt_4a7d8ec50cfd6af753c4f8aee3425070=1534771412',
    }
    # Initial request for the album page. Without a timeout a stalled
    # connection would block this producer forever.
    response = requests.get(base_url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # Extract the href of every track listed on the page (xpath returns a list).
    html_ele = etree.HTML(response.text)
    html_url = html_ele.xpath('//*[@class="dOi2"]/li/div[2]/a/@href')
    html_href = 'https://www.ximalaya.com/revision/play/tracks?trackIds='
    for i in html_url:
        # The last path segment of the track href is the same id that the
        # "play" AJAX endpoint takes, so the track detail page never has to
        # be requested. rstrip guards against a trailing slash yielding an
        # empty id.
        track_id = i.rstrip('/').split('/')[-1]
        html_url_src(html_href + track_id, queue, headers)
# html_url = html_ele.xpath('//*[@id="root"]/main/section/div/div[2]/div[1]/div[2]/div[2]/ul/li/div[2]/a/@href')
def html_url_src(href_scr, queue, headers, timeout=10):
    """Request one track's play-info AJAX URL and enqueue its download data.

    Args:
        href_scr: Full AJAX URL for the track(s).
        queue: Object with a ``put`` method; receives one ``(src, name)``
            tuple per entry in the response.
        headers: HTTP headers to send with the request.
        timeout: Seconds to wait for the response before giving up.

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            non-success HTTP status.
        KeyError: If the JSON payload lacks the expected keys.
    """
    # A timeout keeps a stalled connection from hanging the producer, and
    # the status check surfaces HTTP errors before the body is parsed as JSON.
    ob_response = requests.get(href_scr, headers=headers, timeout=timeout)
    ob_response.raise_for_status()
    ajax_data = ob_response.json()
    for track in ajax_data['data']['tracksForAudioPlay']:
        ajax_src = track['src']         # download address of the track
        ajax_name = track['trackName']  # display name of the track
        # Hand the pair to the consumer through the communication queue.
        queue.put((ajax_src, ajax_name))
def download_src(src_mp4):
    """Download one track into the ``dowlond`` directory.

    Args:
        src_mp4: ``(url, name)`` tuple; the file is saved as
            ``dowlond/<name>.mp4`` relative to the current working directory.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises if the URL cannot be
        fetched or the target file cannot be written.
    """
    print(src_mp4)
    # exist_ok avoids the check-then-create race when several pool workers
    # start at the same time and all find the directory missing.
    os.makedirs('dowlond', exist_ok=True)
    ajax_src, ajax_name = src_mp4
    # Track names come from remote data; path separators and other characters
    # that are reserved in file names would otherwise yield an invalid path.
    unsafe_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})
    faml = 'dowlond/' + ajax_name.translate(unsafe_chars) + '.mp4'
    request.urlretrieve(ajax_src, faml)
if __name__ == '__main__':
    from multiprocessing import Queue, Process, Pool
    from queue import Empty

    # Queue used to pass (url, name) tuples from the producer to this process.
    q = Queue()
    # Producer process: scrapes the album page and fills the queue.
    process = Process(target=list_url, args=(q,))
    process.start()
    # Worker pool that performs the downloads (at most 10 processes).
    download_pool = Pool(10)
    pending = []
    producer_done = False
    # Drain the queue until the producer has exited and nothing is left.
    # A fixed item count with a blocking get() would hang forever if the
    # producer yielded fewer items than expected or crashed.
    while True:
        try:
            src_mp4 = q.get(timeout=1)
        except Empty:
            if producer_done:
                break
            # After the producer is first seen dead, poll once more so items
            # it flushed just before exiting are not lost.
            producer_done = not process.is_alive()
            continue
        pending.append(
            (src_mp4, download_pool.apply_async(download_src, (src_mp4,)))
        )
    download_pool.close()
    download_pool.join()
    process.join()
    # apply_async stores worker exceptions in the result object; report them
    # here instead of letting failed downloads go unnoticed.
    for src_mp4, result in pending:
        try:
            result.get()
        except Exception as exc:
            print('download failed:', src_mp4, exc)
# list_response = requests.get(i,headers=headers)
# print(list_response)
# if __name__ == '__main__':
# ob = base_response()
# print(ob)