百度音乐----墨明棋妙
刚开始学习Python,试着写了些爬虫,刚开始爬取发现只能爬当前页,第二页和后面的歌曲id只能从网页看到,爬取深度应该不够,而加载第二页时地址栏的地址没有改变,观察第一页后面的XHR信息终于找到了后面页面的歌曲信息数据。
'http://music.taihe.com/data/user/getsongs?start='+str(i*15)+'&size=15&ting_uid=88012547'
#这个是最后能够爬所有歌曲的主要信息,学习中,大神绕过。分享下,给大家交流,侵联删
```python
import requests
import json,os
from lxml import etree
def get_html(url):
    """GET *url* with browser-like headers and return the requests Response.

    Used both for JSON/HTML API calls (via .json()/.text) and for binary
    mp3 downloads (via .content).
    """
    headers = {
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
    response = requests.get(url, headers=headers)
    # Bug fix: the original set response.encoding = 'gzip'. 'gzip' is a
    # transfer/content encoding, not a text codec, so .text / .json()
    # decoding was broken.  The Taihe endpoints serve UTF-8 text; binary
    # .content is unaffected by this attribute either way.
    response.encoding = 'utf-8'
    return response
def get_one_info(api_url):
    """Query the song-detail API and return (song title, download link)."""
    payload = get_html(api_url).json()
    title = payload['songinfo']['title']
    link = payload['bitrate']['file_link']
    return title, link
def new_path(name):
    """Ensure a directory *name* exists under the current directory.

    Returns *name* unchanged so callers can use it as the save path.

    Improvement over the original exists-check + mkdir pair: that LBYL
    pattern is racy and non-idiomatic; os.makedirs(..., exist_ok=True)
    performs the check-and-create in one call.
    """
    os.makedirs(name, exist_ok=True)
    return name
def down_one_music(path, song_name, down_url):
    """Download one track and save it as ./<path>/<song_name>.mp3."""
    audio = get_html(down_url).content
    with open(f'./{path}/{song_name}.mp3', 'wb')as f:
        f.write(audio)
    print(f'{song_name}下载完成')
def get_list(url):
    """Fetch one paginated song-list chunk and return (names, hrefs).

    The endpoint answers with JSON whose data.html field is an HTML
    fragment; song titles and their link hrefs are extracted via XPath.
    """
    fragment = get_html(url).json()['data']['html']
    doc = etree.HTML(fragment)
    names = doc.xpath('//li/div/span/a/text()')
    hrefs = doc.xpath('//li/div/span/a/@href')
    return names, hrefs
def get_list_urls(num, ting_uid='88012547', page_size=15):
    """Build the paginated getsongs API URLs.

    Generalized from the original, which hard-coded the artist id and the
    15-songs-per-page size; the defaults keep existing callers working.

    :param num: number of pages to request (this artist has 9).
    :param ting_uid: Taihe artist id to list songs for.
    :param page_size: songs per page; also the start-offset stride.
    :return: list of *num* URL strings.
    """
    base = 'http://music.taihe.com/data/user/getsongs'
    return [f'{base}?start={i * page_size}&size={page_size}&ting_uid={ting_uid}'
            for i in range(num)]
if __name__ == '__main__':
    # Make sure the destination folder exists, then walk every page of the
    # artist's song list, resolving and downloading each track.
    save_dir = new_path('墨明棋妙')
    for page_url in get_list_urls(9):
        names, hrefs = get_list(page_url)
        for title, href in zip(names, hrefs):
            # Strip the first 6 chars of the href to get the numeric song id
            # (presumably a '/song/' prefix — TODO confirm against the page).
            detail_api = ('http://musicapi.taihe.com/v1/restserver/ting'
                          '?method=baidu.ting.song.playAAC&format=jsonp&songid=' + href[6:])
            title, link = get_one_info(detail_api)
            down_one_music(save_dir, title, link)
        print('本页下载完成')