*16-3-27更新,原方法已失效,仅供参考
实现思路是这样的:
1.访问songer的所有作品列表页,例如:
http://5sing.kugou.com/marblue/fc/1.html
显示的是songer的翻唱作品第一页,修改链接中的页码即可依次访问所有作品列表页。
2.正则表达式抓取作品id号与歌曲名字。
3.以id为14041908的翻唱作品为例,利用id号与歌曲类型生成链接:
http://5sing.kugou.com/fc/DownFile.aspx?SongID=14041908&SongType=fc
4.利用requests模块模拟登录5sing网站,访问第3步生成的链接,得到重定向后的链接。以第3步例子中的作品为例,重定向后的链接是:
http://data1.5sing.kgimg.com/T1EZZPB4YT1R47IVrK.mp3
5.利用第4步得到的链接与urllib.urlretrieve即可下载歌曲至本地,文件名可以用第2步抓取的歌曲名字命名。
ps.由于这是单线程爬虫,下载起来很慢,于是我想了个高效的使用该爬虫的方法,即生成两个表:下载链接表,下载文件名与歌曲名字对应表。利用迅雷读取下载链接表即可。因为下载下来的文件都是如T1EZZPB4YT1R47IVrK.mp3一类的形式,因此可以利用批处理命令与对应表修改mp3文件名便大功告成。
这里以歌手“三无marblue”为例:
#coding:utf-8
import requests
import urllib
import urllib2
import re
import os
def redirect(songid, songtype, timeout=30):
    """Resolve the final download URL of a song.

    Sends a GET request to the site's ``DownFile.aspx`` endpoint with a
    browser-like header set and returns the URL the request ends up at once
    redirects have been followed.

    Args:
        songid: Song id; converted with ``str()`` and sent as ``SongID``.
        songtype: Song type; converted with ``str()`` and used both as the
            URL path segment and as the ``SongType`` query parameter.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block the crawl forever.

    Returns:
        The final URL of the response (``Response.url``).
    """
    params = {'SongID': str(songid), 'SongType': str(songtype)}
    # NOTE(review): the Cookie below is a hard-coded, logged-in browser session
    # (PHPSESSID / 5sing_auth). It is a credential committed to source and will
    # expire; consider loading it from configuration instead.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Cookie': 'niceScrollTop=0; _ga=GA1.2.68528838.1409202005; bdshare_firstime=1416837992203; wsp_volume=0.2; wsp_ismuted=0; liveWindow=1; CNZZDATA4559556=cnzz_eid%3D144953663-1417489637-http%253A%252F%252Fopenapi.qzone.qq.com%252F%26ntime%3D1428148526; PHPSESSID=euhdsfqlb5lgqkvd36tn2gt337; 5sing_ssid=euhdsfqlb5lgqkvd36tn2gt337; CNZZDATA5758545=cnzz_eid%3D1849251994-1416835424-http%253A%252F%252F5sing.kugou.com%252F%26ntime%3D1429798455; 5sing_auth=9wyi1oEy05Th8rSQwAXb+K61FrHwZfsPNHoltUMCB+BYjcohtSdq+w==; 5sing_user_info=a%3A3%3A%7Bs%3A7%3A%22wsingId%22%3Bi%3A33169395%3Bs%3A8%3A%22username%22%3Bs%3A9%3A%22%E4%B9%B1%E5%BD%B1%E7%83%9B%22%3Bs%3A6%3A%22avatar%22%3Bs%3A53%3A%22http%3A%2F%2Fimg10.5sing.kgimg.com%2Fm%2FT1MUEPB4AT1RXrhCrK.jpg%22%3B%7D; Anonymous=68ebeb3fff0b4cde817d60830d9b1a7f; area=%E5%85%A8%E9%83%A8; CNZZDATA983423=cnzz_eid%3D1505656430-1414072487-%26ntime%3D1429963731',
        'Host': '5sing.kugou.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36 SE 2.X MetaSr 1.0',
    }
    url = 'http://5sing.kugou.com/' + str(songtype) + '/DownFile.aspx'
    # stream=True: only the final URL is needed, so do not download the body
    # of the redirected response here.
    r = requests.get(url, headers=headers, params=params,
                     stream=True, timeout=timeout)
    try:
        return r.url
    finally:
        # A streamed response must be closed explicitly to release the connection.
        r.close()
def getinfo(url):
    """Scrape one song-list page and resolve each listed song's download URL.

    Fetches ``url``, extracts every ``fc`` song link (id and title) from the
    HTML, and calls ``redirect(id, 'fc')`` for each id, in page order.

    Args:
        url: Address of a song-list page to fetch with ``urllib2.urlopen``.

    Returns:
        A 4-tuple ``(id_list, name_list, url_list, down_name_list)``:
        the song ids, the song titles, the resolved download URLs, and a list
        of ``{'downurl': ..., 'songname': ...}`` dicts pairing the last two.
        All four lists have the same length and order.
    """
    page = urllib2.urlopen(url)
    try:
        html = page.read()
    finally:
        page.close()

    # Capture id and title in ONE match so they can never get out of step.
    # Two separate patterns with greedy ".*" could return lists of different
    # lengths (or pair a title with the wrong id) when anchors share a line.
    link_pat = re.compile(
        r'<a href="http://5sing\.kugou\.com/fc/([^"]*?)\.html"'
        r' title="(.*?)" target="_blank">'
    )
    pairs = link_pat.findall(html)

    id_list = [song_id for song_id, _ in pairs]
    name_list = [song_name for _, song_name in pairs]
    url_list = [redirect(song_id, 'fc') for song_id in id_list]
    down_name_list = [
        {'downurl': down_url, 'songname': song_name}
        for down_url, song_name in zip(url_list, name_list)
    ]
    return id_list, name_list, url_list, down_name_list
def download(down_name_list):
    """Download every song described in ``down_name_list`` to local disk.

    Args:
        down_name_list: Iterable of dicts with a ``'songname'`` entry (used to
            build the local file name) and a ``'downurl'`` entry (the address
            passed to ``urllib.urlretrieve``).

    Prints one progress line before and one after each download.
    """
    for entry in down_name_list:
        # NOTE(review): the song name is appended straight onto the prefix with
        # no path separator, so the prefix's last component ends up as a
        # file-name prefix rather than a folder -- confirm this is intended.
        target = 'D:\mp3\三无marbule' + str(entry['songname']) + '.mp3'
        print(target + ' downling~\n')
        urllib.urlretrieve(str(entry['downurl']), target)
        print(target + ' downloaded~\n')
page_num = 4
songtype = 'fc'
x =