import requests
import json
import base64
import random
import time
from datetime import datetime
from Crypto.Cipher import AES
import pandas as pd
import os
# Pool of desktop-browser User-Agent strings; one is picked at random per run
# so repeated requests look less uniform to the server.
agents = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
# Request headers shared by every POST to the comment endpoint.
headers = {
'Host': 'music.163.com',
'Origin': 'https://music.163.com',
'Referer': 'https://music.163.com/song?id=28793052',
# random.choice picks one UA string directly; the original
# ''.join(random.sample(agents, 1)) built a one-element list first.
'User-Agent': random.choice(agents)
}
# Constants from the site's client-side weapi encryption scheme:
# RSA public exponent (param2), RSA modulus (param3), and the fixed
# first-round AES key (param4). These mirror the values in the site's JS.
param2 = "010001"
param3 = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
param4 = "0CoJUm6Qyw8W8jud"
# Comment API endpoint (csrf_token deliberately left empty).
url = "https://music.163.com/weapi/comment/resource/comments/get?csrf_token="
class crawlComments(object):
    """Crawl the comments of one NetEase Cloud Music song and save to Excel.

    Fixes over the original:
    - ``AES_encrypt``/``asrsea`` are now ``@staticmethod``s.  ``asrsea`` was
      defined with 4 positional parameters but invoked on the instance
      (``cc.asrsea(param1, param2, param3, param4)``), which passed ``self``
      as a 5th argument and raised ``TypeError``.
    - Methods call ``self`` instead of the module-level global ``cc``, so a
      second instance no longer silently uses the wrong object.
    - ``DataFrame.append`` (removed in pandas 2.0) replaced by ``pd.concat``.
    - Local ``time`` no longer shadows the ``time`` module; ``dir`` no longer
      shadows the builtin; paths built with ``os.path.join`` for portability.
    """

    def __init__(self, song_data, page):
        """``song_data`` is one input line "song_name|singer|song_id";
        ``page`` is how many 20-comment pages to fetch."""
        self.page = page
        self.song_name, self.singer, self.song_id = song_data.split('|')  # split the record
        self.df_all = pd.DataFrame()  # accumulates every fetched comment

    # --- encryption step ---
    @staticmethod
    def AES_encrypt(text, key, iv):
        """AES-CBC encrypt ``text`` (PKCS#7-padded) with ``key``/``iv``;
        return the ciphertext as base64 text."""
        pad = 16 - len(text) % 16  # PKCS#7: pad value == pad length
        padded = (text + pad * chr(pad)).encode("utf-8")
        encryptor = AES.new(key.encode('utf-8'), AES.MODE_CBC, iv)
        return base64.b64encode(encryptor.encrypt(padded)).decode('utf-8')

    # --- build request parameters ---
    @staticmethod
    def asrsea(p1, p2, p3, p4):
        """Produce the ``encText``/``encSecKey`` pair expected by the weapi
        endpoint.  ``p2``/``p3`` (RSA exponent/modulus) are kept for
        signature compatibility; the RSA step is pre-computed below."""
        res = {}
        # Fixed "random" second-round key; encSecKey below is its RSA
        # encryption, pre-computed so no RSA is done at runtime.
        rand_num = "aq9d7cvBOJ1tzj1o"
        vi = b"0102030405060708"
        h_encText = crawlComments.AES_encrypt(p1, p4, vi)
        h_encText = crawlComments.AES_encrypt(h_encText, rand_num, vi)
        res["encText"] = h_encText
        res["encSecKey"] = "5dec9ded1d7223302cc7db8d7e0428b04139743ab7e3d451ae47837f34e66f9a86f63e45ef20d147c33d88530a6c3c9d9d88e38586b42ee30ce43fbf3283a2b10e3118b76e11d6561d80e33ae38deb96832b1a358665c0579b1576b21f995829d45fc43612eede2ac243c6ebb6c2d16127742f3ac913d3ac7d6026b44cee424e"
        return res

    # --- fetch comment pages as JSON ---
    def get_json(self):
        """Fetch ``self.page`` pages of comments and hand each page to
        ``get_comments``; the last page triggers the Excel save."""
        react = False  # becomes True on the final page -> save the file
        for i in range(self.page):
            curr_time = int(time.time() * 1000)  # current time in ms, used as cursor
            param1 = json.dumps(
                {"csrf_token": "", "cursor": "%s" % curr_time, "offset": str(i * 20), "orderType": "2",
                 "pageNo": str(i + 1),
                 "pageSize": "20", "rid": "R_SO_4_" + self.song_id.strip(),
                 "threadId": "R_SO_4_" + self.song_id.strip()})
            # Build the encrypted request parameters.
            asrsea_res = self.asrsea(param1, param2, param3, param4)
            param_data = {"params": asrsea_res["encText"],
                          "encSecKey": asrsea_res["encSecKey"]}
            # NOTE(review): verify=False disables TLS certificate checks;
            # kept for behavior compatibility but worth revisiting.
            response = requests.post(url, headers=headers, data=param_data, verify=False)
            sole_comments = json.loads(response.text)["data"]["comments"]
            if i == self.page - 1:  # last page for this song -> flush to disk
                react = True
            self.get_comments(sole_comments, react)  # record this page's comments
            time.sleep(random.choice(range(1, 3)))  # polite delay between pages

    # --- record and persist comments ---
    def get_comments(self, comment_json, react):
        """Append the comments in ``comment_json`` to ``self.df_all``;
        when ``react`` is True, write the accumulated frame to Excel."""
        out_dir = os.path.join(os.getcwd(), "comments")
        if not os.path.exists(out_dir):  # create the output folder on first use
            os.makedirs(out_dir)
        rows = []
        for comment in comment_json:
            rows.append({
                '用户昵称': comment['user']['nickname'],
                '评论': comment['content'],
                # API timestamps are in milliseconds -> local time string.
                '评论时间': datetime.fromtimestamp(comment['time'] / 1000.0).strftime('%Y-%m-%d %H:%M:%S'),
                '点赞数': comment['likedCount'],
            })
        if rows:
            # DataFrame.append was removed in pandas 2.0; concat once per page.
            self.df_all = pd.concat([self.df_all, pd.DataFrame(rows)], ignore_index=True)
        if react:
            self.df_all.to_excel(
                os.path.join(out_dir, self.song_name + '--' + self.singer + '.xlsx'),
                index=False)
if __name__ == '__main__':
    start_time = time.time()  # wall-clock start for the summary line
    # Each line of 歌曲.txt is one record: "song_name|singer|song_id".
    with open('歌曲.txt', 'r', encoding="utf8") as f:
        songs_data = f.readlines()
    page = int(input("请输入爬取页数:"))  # number of 20-comment pages per song
    # enumerate replaces the original manual count/while loop.
    for index, song_data in enumerate(songs_data, start=1):
        print("正在抓取第%d首歌曲的评论..." % index)
        cc = crawlComments(song_data, page)
        cc.get_json()
    end_time = time.time()
    print("耗时%f秒。" % (end_time - start_time))
# NetEase Cloud Music comment crawler v1.0 (网易云音乐评论爬取1.0)
# Source blog post last updated 2024-03-14 13:11:30.