网易热歌热评数据网络爬虫程序源码

最新推荐文章于 2024-03-04 15:26:51 发布

青青传媒

最新推荐文章于 2024-03-04 15:26:51 发布

阅读量637

点赞数

文章标签：爬虫

本文链接：https://blog.csdn.net/weixin_47000687/article/details/131556874

版权

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import urllib.request
import urllib.error
import urllib.parse
import json

from Crypto.Cipher import AES
import base64

import requests,pprint,json

# URL of the "飙升榜" (rising chart) toplist page.
url_page = 'http://music.163.com/discover/toplist?id=19723756' # rising-chart URL; the page is downloaded so its data can be analysed
# NOTE: this runs at import time and saves the page as 'toplist.html'
# (a relative path, so it lands in the current working directory).
urllib.request.urlretrieve(url_page,'toplist.html')
# Artist discovery page URL; not read by any code visible in this file.
url_page1 = 'https://music.163.com/discover/artist'

# Encryption values obtained by debugging the NetEase Music page JavaScript
# Module-level `params` is not read by the visible code (functions build their own).
params = ""
# Used as the AES key in cryptjscomplex().
_i = "l6Brr86UeZ6C3Bsw" # this string is used by default
# Sent as the 'encSecKey' form field by gethotsongurl().
encSecKey = "7ca9b5ba8b13044f47ed74c388df912ac84758122acbedc64111f2ac83232b01d3ce16f7195a39c7e064b4c0240b5c1d52624dc13c22ec820d76dfe32db43e496aeacced5be3ca9108c78a85bb389f1edf8d8c9fced02024ba9490401b4ce062cc50764d0a24294e07bb229271391b5a3640e924ee1ed15435dc6e288f1fa873"
def cryptjscomplex(text):
    """Encrypt *text* with AES-CBC under the module-level key ``_i``.

    This is the second of the two encryption passes applied by
    ``getparams``. The plaintext is UTF-8 encoded, padded up to a multiple
    of the AES block size by appending N bytes of value N (N is between 1
    and the block size, so already-aligned input gains a full block), then
    encrypted with the fixed IV ``b'0102030405060708'``.

    Args:
        text: The string to encrypt.

    Returns:
        The ciphertext, Base64-encoded, as a ``str``.
    """
    raw = text.encode("utf-8")
    pad_len = AES.block_size - len(raw) % AES.block_size
    padded = raw + bytes([pad_len]) * pad_len

    # A cipher object can only be used once, so build a fresh one per call.
    key = bytes(_i, encoding="utf-8")
    cipher = AES.new(key, mode=AES.MODE_CBC, IV=b'0102030405060708')
    return base64.b64encode(cipher.encrypt(padded)).decode("utf-8")

def cryptjscomplexbase(text):
    """Encrypt *text* with AES-CBC under the fixed key ``b'0CoJUm6Qyw8W8jud'``.

    This is the first of the two encryption passes applied by
    ``getparams``. The plaintext is UTF-8 encoded, padded up to a multiple
    of the AES block size by appending N bytes of value N (N is between 1
    and the block size, so already-aligned input gains a full block), then
    encrypted with the fixed IV ``b'0102030405060708'``.

    Args:
        text: The string to encrypt.

    Returns:
        The ciphertext, Base64-encoded, as a ``str``.
    """
    raw = text.encode("utf-8")
    pad_len = AES.block_size - len(raw) % AES.block_size
    padded = raw + bytes([pad_len]) * pad_len

    # A cipher object can only be used once, so build a fresh one per call.
    cipher = AES.new(b'0CoJUm6Qyw8W8jud', mode=AES.MODE_CBC, IV=b'0102030405060708')
    return base64.b64encode(cipher.encrypt(padded)).decode("utf-8")

# Build the value of the `params` form field
def getparams(text):
    """Return *text* encrypted twice, ready to be sent as the ``params`` field.

    The text is first encrypted by ``cryptjscomplexbase`` (fixed key); the
    resulting Base64 string is then encrypted again by ``cryptjscomplex``
    (module-level key ``_i``).

    Args:
        text: The request payload as a string.

    Returns:
        The doubly encrypted, Base64-encoded string.
    """
    first_pass = cryptjscomplexbase(text)
    return cryptjscomplex(first_pass)

def gethotSong(model):  # fetch song names and ids
    """Download a toplist page and extract the song names and ids it lists.

    The page embeds the chart as a hidden ``<ul class="f-hide">`` list of
    ``<li><a href="/song?id=...">name</a></li>`` entries; the first such
    list found is parsed.

    Args:
        model: A dict with keys ``"url"`` (the toplist page URL, e.g.
            ``'http://music.163.com/discover/toplist?id=3778678'``) and
            ``"typename"`` (a label for the chart).

    Returns:
        A tuple ``(hot_song_name, hot_song_id, typename)`` where the first
        two items are parallel lists of strings. Both lists are empty when
        the page does not contain the expected song list.
    """
    url = model["url"]
    typename = model["typename"]

    header = {
        'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    request = urllib.request.Request(url=url, headers=header)
    # Close the response as soon as the body has been read.
    with urllib.request.urlopen(request) as response:
        page = response.read().decode('utf8')

    list_pat = r'<ul class="f-hide"><li><a href="/song\?id=\d*?">.*</a></li></ul>'
    song_lists = re.findall(list_pat, page)
    if not song_lists:
        # Page layout changed or the request was rejected: nothing to parse.
        return [], [], typename

    # One pass captures both the id and the name of each entry, so the two
    # returned lists are aligned by construction.
    item_pat = r'<li><a href="/song\?id=(\d*?)">(.*?)</a></li>'
    entries = re.findall(item_pat, song_lists[0])
    hot_song_name = [name for _, name in entries]
    hot_song_id = [song_id for song_id, _ in entries]
    return hot_song_name, hot_song_id, typename

def gethotComments(hot_song_name, hot_song_id):
    """Fetch a song's hot comments and append them to ``./song_comments``.

    For the given song this posts the pre-captured encrypted form fields to
    the comments endpoint, looks up the song's file URL via
    ``gethotsongurl`` and appends one record to the output file: the song
    name, either the file URL or a "VIP song cannot be downloaded" line,
    the numbered hot comments, and a separator line.

    Args:
        hot_song_name: Song title written as the record header.
        hot_song_id: Song id as a string; it is concatenated into the URL.

    Returns:
        None.
    """
    url = 'http://music.163.com/weapi/v1/resource/comments/R_SO_4_' + hot_song_id + '?csrf_token='
    header = {
        'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    # Encrypted form fields captured by hand from the web page
    data = {
        'params': 'tn3BOnrm4yHwP8/rNO67PXkdFsIZr1GxkQwludtNY5XPY781W82g3aiHhY+bd4quT7p6EH25qBqfLFdsxCMl0CI73mfflrOUFbK6vGE4g4tW9tfNiHhJucks1/wEsO/RaxWNo2JwQ1KmvO5pAMGYeg==',
        'encSecKey': '9f9ff771dbf2a8de2091e6e90da84fa20231b2c4c17e1830ce41a0e9128e71568b9968ae9944f6ff5cf58b711805200a1a586a7682f2500b98751c43866957975c9ff0092555efb27d558243da6f7331bdb80e12f560ccf9dcc46d05a66707b5e007ad3bbf2937c89b17339083d7dd4dc4099f5ad0b2686293e6d941e7f09559'
    }
    postdata = urllib.parse.urlencode(data).encode('utf8')
    request = urllib.request.Request(url, headers=header, data=postdata)
    with urllib.request.urlopen(request) as response:
        json_dict = json.loads(response.read().decode('utf8'))
    # A reply without 'hotComments' is treated as "no comments" so that one
    # bad song does not abort the whole crawl.
    hot_comments = json_dict.get('hotComments') or []

    # Look up the song's file URL (only the URL is used here)
    *_, song_url = gethotsongurl(hot_song_id)

    # 'a' = append; the context manager closes the file even if a write fails
    with open('./song_comments', 'a', encoding='utf-8') as fhandle:
        fhandle.write(hot_song_name + ':' + '\n')
        # Record the song file URL, or note that none was returned
        if song_url is None:
            fhandle.write("VIP歌曲无法下载" + '\n')
        else:
            fhandle.write("歌曲地址" + ':' + song_url + '\n')

        for num, item in enumerate(hot_comments, 1):
            fhandle.write(str(num) + '.' + item['content'] + '\n')
        fhandle.write('\n==============================================\n\n')
# Download the music file
def gethotsongfile(hot_song_id):
    """Print *hot_song_id*.

    The comment above this function describes it as the music-file
    downloader, but the current body only prints its argument.

    Args:
        hot_song_id: The song id to print.

    Returns:
        None.
    """
    print(hot_song_id)
# Look up the file of a given song
def gethotsongurl(song_id_in):
    """Query the player endpoint for the file URL of one song.

    The request payload is encrypted with ``getparams`` and posted together
    with the module-level ``encSecKey``.

    Args:
        song_id_in: Song id; it is interpolated into the JSON payload.

    Returns:
        A tuple ``(song_id_, song_id, song_time, song_url)`` taken from the
        last entry of the response's ``data`` list, where ``song_id_`` is
        *song_id_in* echoed back. All four are ``None`` when the response
        has no ``data`` entries.
    """
    request_url = 'https://music.163.com/weapi/song/enhance/player/url/v1?csrf_token='
    header = {
        'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    # Field names follow the page's JavaScript request:
    #   ids: JSON.stringify([this.cw8o.id]),
    #   level: DEFAULT_LEVEL,
    #   encodeType: DEFAULT_ENCODETYPE
    text = '{"ids":["%s"],"level":"standard","encodeType":"aac","csrf_token":""}' % (song_id_in,)
    params = (
        ('csrf_token', ''),
    )
    data = {
        'params': getparams(text),
        'encSecKey': encSecKey
    }
    response = requests.post(request_url, headers=header, params=params, data=data)
    json_dict = json.loads(response.text)

    # Initialise every result so an empty or missing 'data' list yields
    # Nones instead of an UnboundLocalError at the return statement.
    song_id_ = song_id = song_time = song_url = None
    for item in json_dict.get('data') or []:
        song_id = item['id']
        song_time = item['time']
        song_url = item['url']
        song_id_ = song_id_in
    return song_id_, song_id, song_time, song_url
# Download the lyrics
def gethotsonglyric(hot_song_id):
    """Print *hot_song_id*.

    The comment above this function describes it as the lyrics downloader,
    but the current body only prints its argument.

    Args:
        hot_song_id: The song id to print.

    Returns:
        None.
    """
    print(hot_song_id)

if __name__ == '__main__':
    # Charts to crawl: each entry pairs a toplist page URL with the label
    # shown in the progress messages.
    urls = [
        {'url': 'http://music.163.com/discover/toplist?id=19723756', 'typename': '飙升'},
        {'url': 'http://music.163.com/discover/toplist?id=3779629', 'typename': '新歌'},
        {'url': 'http://music.163.com/discover/toplist?id=2884035', 'typename': '原创'},
        {'url': 'http://music.163.com/discover/toplist?id=3778678', 'typename': '热歌'},
    ]

    for url in urls:
        print(url)
        print(url["url"])
        print(url["typename"])
        # Fetch the song names and ids listed on this chart
        hot_song_name, hot_song_id, typename = gethotSong(url)

        # Save the hot comments of every song, numbering songs from 1
        for num, (song_name, song_id) in enumerate(zip(hot_song_name, hot_song_id), 1):
            print('正在抓取%s第%d首歌曲热评...' % (typename, num))
            gethotComments(song_name, song_id)
            print('第%d首歌曲热评抓取成功' % num)