标签:爬虫
网址示例: https://node.kg.qq.com/personal?uid=639e9983222a338a
直接上源码:
import requests
import time
import re
import json
import pprint
import math
import os
# Shared request headers: a desktop-browser User-Agent so kg.qq.com serves
# the normal HTML pages instead of blocking the scraper.
header={
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
# Accumulator filled by Parse_Song_Info; each entry is a dict with
# keys "name", "url" and "type" (".mp3" or ".mp4").
SongList=[]
# Per-song play-page URL (takes a "s"=shareid query parameter).
song_baseurl="http://node.kg.qq.com/play"
# Homepage API endpoint that lists a user's songs page by page.
album_baseurl="http://node.kg.qq.com/cgi/fcgi-bin/kg_ugc_get_homepage"
def Down(url_file, filePath, FileDir):
    """Stream-download url_file into FileDir/filePath with a progress line.

    Creates FileDir when missing, skips files that already exist
    (returns 0), and removes any partially written file on error.
    """
    if not os.path.isdir(FileDir):
        os.makedirs(FileDir)
    target = os.path.join(FileDir, filePath)
    if os.path.isfile(target):
        print(filePath + " --已存在")
        return 0
    try:
        # stream=True so large media files are written chunk by chunk;
        # timeout keeps a dead connection from hanging the script forever.
        r = requests.get(url_file, stream=True, timeout=30)
        with open(target, "wb") as f:
            size = int(r.headers['content-length'])
            # BUG FIX: `size` is an int; the original concatenated it to a
            # str, raising TypeError on every download (swallowed below).
            title = " 当前下载-" + filePath + " 文件大小:" + str(size) + "字节"
            print('\033[0;31m' + title + "\033[0m")
            CurTotal = 0
            for chunk in r.iter_content(chunk_size=512 * 1024):
                if chunk:
                    f.write(chunk)
                    CurTotal += len(chunk)
                    print("\r" + filePath + "--下载进度:" + '%3s' % (str(CurTotal * 100 // size)) + "%", end='')
            print()
        r.close()
    except Exception as e:
        # Best-effort download: report the failure and clean up the
        # partial file so a rerun will retry it.
        print(filePath + " 下载出错!" + " 错误信息" + str(e.args))
        if os.path.isfile(target):
            os.remove(target)
def GetData(data, url):
    """GET `url` with query parameters `data`; return the UTF-8 decoded body."""
    resp = requests.get(url, headers=header, params=data)
    return resp.content.decode("utf-8")
def Parse_Song_Info(content):
    """Extract one song's name and playable URL from a play-page's HTML.

    Reads the JSON assigned to window.__DATA__, prefers the audio URL
    (playurl, saved as .mp3) over the video URL (playurl_video, .mp4),
    and appends a {"name", "url", "type"} record to the global SongList.
    """
    matches = re.findall(r'window.__DATA__ = (.*?); ', content)
    if not matches:
        print("没有爬取到")
        return
    detail = json.loads(matches[0])['detail']
    # pprint.pprint(detail)
    song = {"name": detail['song_name']}
    if detail['playurl']:
        print(detail['song_name'] + " 音乐:" + detail['playurl'])
        song["url"] = detail['playurl']
        song["type"] = ".mp3"
    else:
        print(detail['song_name'] + " 视频:" + detail['playurl_video'])
        song["url"] = detail['playurl_video']
        song["type"] = ".mp4"
    SongList.append(song)
def GetSongsByIndex(uid, Is_Parse, page):
    """Fetch one page (8 entries) of a user's song list from the API.

    When Is_Parse is true, each entry's play page is downloaded and fed
    to Parse_Song_Info; otherwise only the total count is printed.
    Returns the user's total song count, or 0 when the JSONP response
    could not be parsed.
    """
    params = {
        'jsonpCallback': 'callback_0',
        'g_tk': '5381',
        'outCharset': 'utf-8',
        'format': 'jsonp',
        'type': 'get_ugc',
        'start': str(page),
        'num': '8',
        'touin': '',
        'share_uid': uid,
        'g_tk_openkey': '5381',
        '_': str(int(time.time() * 1000)),
    }
    resp = requests.get(album_baseurl, params=params, headers=header)
    # The API answers with JSONP: callback_0({...}) — strip the wrapper.
    payload = re.findall(r'callback_\d\((.*)\)', resp.content.decode("utf-8"))
    if not payload:
        print("没有爬取到")
        return 0
    body = json.loads(payload[0])
    # pprint.pprint(body)
    total = body['data']['ugc_total_count']
    if not Is_Parse:
        pprint.pprint("共计:" + str(total))
        return total
    time.sleep(1)  # be polite between page fetches
    for item in body['data']['ugclist']:
        print(item['title'] + " -- " + item['shareid'])
        Parse_Song_Info(GetData({"s": item['shareid']}, song_baseurl))
    return total
def Run(uid):
    """Crawl every song of the user identified by `uid` into SongList."""
    total = GetSongsByIndex(uid, False, 1)
    if total == 0:
        print("该用户没有歌曲")
        return
    # 8 songs per page — walk pages 1..ceil(total/8).
    for page in range(1, math.ceil(total / 8) + 1):
        GetSongsByIndex(uid, True, page)
if __name__ == "__main__":
    # Crawl the user's songs, then download every collected entry
    # into the "小小" directory.
    Run('639e9983222a338a')
    for song in SongList:
        Down(song["url"], song["name"] + song["type"], "小小")
将 Run() 括号里的字符串替换为歌手主页链接末尾的 uid 参数。
“小小”是下载文件保存的文件夹名称,可自行修改。
标签:爬虫
来源: https://www.cnblogs.com/yuanzessrs/p/10247347.html