网易云音乐评论爬取、情感分析一体化

最新推荐文章于 2024-10-31 14:30:39 发布

小步吖

最新推荐文章于 2024-10-31 14:30:39 发布

阅读量7k

点赞数 6

分类专栏：爬虫 python 网易云情感分析

本文链接：https://blog.csdn.net/weixin_43133808/article/details/89174387

版权

python 同时被 3 个专栏收录

3 篇文章

订阅专栏

爬虫

2 篇文章

订阅专栏

网易云情感分析

1 篇文章

订阅专栏

开局一张图

网易云诞生了很多励志鸡汤，那么多的伤感流行句式，那么多微甜情话，今天我们就看他个天翻地覆，话不多说直接上个干货。

导入包、相关库

import requests
import math
import random
from Crypto.Cipher import AES
import codecs
import base64
import tkinter
from snownlp import SnowNLP
import matplotlib.pyplot as plt
import numpy as np

获取窗体内输入的内容

def go():
    """Store the current text of ``entry1`` in the global ``song_id``.

    Wired as the ``command`` of the first button; the captured value is also
    echoed to the console.
    """
    global song_id
    song_id = str(entry1.get())
    # Echo what was read from the text box.
    print(song_id)
def go1():
    """Store the current text of ``entry2`` in the global ``path2``.

    Wired as the ``command`` of ``button1``; the captured value is also
    echoed to the console.
    """
    global path2
    path2 = str(entry2.get())
    # Echo what was read from the text box.
    print(path2)
def go2():
    """Store the current text of ``entry3`` in the global ``file_name``.

    Wired as the ``command`` of ``button2``; the captured value is also
    echoed to the console.
    """
    global file_name
    file_name = str(entry3.get())
    # Echo what was read from the text box.
    print(file_name)
def go3():
    """Store the current text of ``entry4`` in the global ``cleaning_file``.

    Wired as the ``command`` of ``button3``; the captured value is also
    echoed to the console.
    """
    global cleaning_file
    cleaning_file = str(entry4.get())
    # Echo what was read from the text box.
    print(cleaning_file)
def go4():
    """Store the current text of ``entry5`` in the global ``cleaned_file``.

    Wired as the ``command`` of ``button4``; the captured value is also
    echoed to the console.
    """
    global cleaned_file
    cleaned_file = str(entry5.get())
    # Echo what was read from the text box.
    print(cleaned_file)
def go5():
    """Store the current text of ``entry6`` in the global ``analysis_path``.

    Wired as the ``command`` of ``button5``; the captured value is also
    echoed to the console.
    """
    global analysis_path
    analysis_path = str(entry6.get())
    # Echo what was read from the text box.
    print(analysis_path)

设置窗体大小、标题

# Create the Tk root window that hosts the input form.
win = tkinter.Tk()
# Initial window size, given as a 'widthxheight' geometry string.
win.geometry('500x500')
# Text shown in the window's title bar.
win.title('网易云音乐情感分析')

创建窗体的输入框和按钮，按钮文字中附带输入规范等提示信息

# Build the input form: one text box per setting, each followed by a button
# whose command (go, go1, ... go5) copies the box's text into a module-level
# global. Widgets appear top-to-bottom in the order they are packed.

# Song id.
entry1 = tkinter.Entry(win, width=50, fg="black")
entry1.pack()
button = tkinter.Button(win, text="请输入id，输入后请点击我", command=go)
button.pack()

# Output directory. The backslash in the label is written as "\\" because
# "\结" is not a valid escape sequence in a normal string literal; the text
# shown on the button is unchanged.
button1 = tkinter.Button(win, text="请输入存储文件的路径（以\\结束，路径就可以，下一个输入框输入名称），输入后请点击我", command=go1, bg='yellow')
entry2 = tkinter.Entry(win, width=50, fg="black", bg='yellow')
entry2.pack()
button1.pack()

# Output file name (".txt" is appended later by the script).
entry3 = tkinter.Entry(win, width=50, fg="black", bg='green')
entry3.pack()
button2 = tkinter.Button(win, text="请输入存储文件的名称（名称就可以，我们自动为您生成txt文件），输入后请点击我", command=go2, bg='green')
button2.pack()

# Path of the file to clean.
entry4 = tkinter.Entry(win, width=50, fg="black", bg='gray')
entry4.pack()
button3 = tkinter.Button(win, text="请输入将要清洗文件的路径（具体到格式），输入后请点击我", command=go3, bg='gray')
button3.pack()

# Path the cleaned file is written to.
entry5 = tkinter.Entry(win, width=50, fg="black", bg='pink')
entry5.pack()
button4 = tkinter.Button(win, text="请输入清洗完毕文件的路径（具体到格式），输入后请点击我", command=go4, bg='pink')
button4.pack()

# Path of the file that sentiment analysis reads.
entry6 = tkinter.Entry(win, width=50, fg="black", bg='orange')
entry6.pack()
button5 = tkinter.Button(win, text="请输入进行情感分析文件的路径（具体到格式），输入后请点击我", command=go5, bg='orange')
button5.pack()

# "Done" button: destroying the window ends mainloop() so the rest of the
# script runs. pack() returns None, so the widget is created and packed in two
# steps to keep a real reference in button6.
button6 = tkinter.Button(win, text='全部输入完毕请点击我', command=win.destroy)
button6.pack()
win.mainloop()

构造函数获取评论数据（返回 JSON）

def get_comments_json(url, data, timeout=10):
    """POST the encrypted form ``data`` to ``url`` and return the decoded JSON.

    Args:
        url: Comment API endpoint to post to.
        data: Form fields for the request (this script passes a dict with
            ``params`` and ``encSecKey``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed JSON body when the server answers with HTTP 200; ``None``
        for any other status, a network error, or a body that is not JSON.
    """
    # Province/City are region codes (the original note gives 0530 / 0531 for
    # Jinan, Shandong as an example) and recommends using values you captured
    # yourself. The cookie is written as adjacent literals so it stays a single
    # valid string; the pasted source had it broken across lines.
    # NOTE(review): this is a hard-coded captured browser cookie - confirm it
    # is still accepted by the server before relying on it.
    cookie = (
        'Province=****; City=****; '
        '_ntes_nnid=861aff69ca0c1a71635a5ed2a0243acb,1544690244162; '
        '_ntes_nuid=861aff69ca0c1a71635a5ed2a0243acb; '
        'UM_distinctid=167a6b6a0a83f1-00127701049a17-335a497c-e1000-167a6b6a0a9517; '
        'usertrack=ezq0o1wSGjl5etI0BD5JAg==; '
        'vjuids=2a1d704ac.167a6b6cb26.0.b0103dac2188b; '
        'vjlast=1544690257.1544690257.30; '
        'nteslogger_exit_time=1544692086080; '
        'vinfo_n_f_l_n3=8a31831dccfd73bf.1.0.1544690256693.0.1544692154822; '
        'JSESSIONID-WYYY=HYgCofY5xb%5Cbn0UObOx4nvEqF1Akb3e%2Fh%2FzcPVbhWyj1KaJZnTusNDfyT5mWEBuSWSJ9uNs5G%2BTpVkenwYj1V7CpefhlP9FP6RtFWxFrbWIbsKPMFQo8lV58%2FrH%2BsHf42oU20b1lqMfoHApESJqjCDM9Mtgs2WRkXWs4Qbb4WTmcIipY%3A1545471673818; '
        '_iuqxldmzr_=32; '
        '__utma=94650624.195620955.1545469875.1545469875.1545469875.1; '
        '__utmc=94650624; '
        '__utmz=94650624.1545469875.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; '
        'WM_NI=HehEClh%2F'
        '%2FQUZj98wglZfRgNpbsu1q9m2HxBPcS9UkOXXysR7gOXojWNn82ueE5kAzm4tLz3eUvdfIZTqY5%2BVheKLttjo3RnK9Bho7dWiyA6FIqm7%2BVm5tA61RUEIYGa%2BQ3k%3D; '
        'WM_NIKE=9ca17ae2e6ffcda170e2e6eeb4d7639bb9f7bad16ea7b88fa7c54f879f9a85bc5cb686a2b5d364bab287bac72af0fea7c3b92a8cbba8a6c268acf1e1b2d852aae7f898e6689aba9e88cd7ca2adbbd2f433afee8899c15ca18df7b6ea459c88b794d13da5919890e95d8eb2f8d3f86f87eba5a2f967f8ac849bb26f97aaac87cc5298af97d6aa5eacbb85adf780aab1fdb3ed41fb9ea890b67095b7b7a2b54e8bafa3d8aa5eafebbca2b53b928b8baadc4da3f59fd4ea37e2a3; '
        'WM_TID=U0nsAG4m95hEAFVVFVZ8fqJzHf1jkqZC; '
        '__utmb=94650624.7.10.1545469875'
    )
    headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
               'Accept-Encoding': 'gzip, deflate',
               'Accept-Language': 'zh-CN,zh;q=0.9',
               'Connection': 'keep-alive',
               'Cookie': cookie,
               'Host': 'music.163.com',
               'Referer': 'http://music.163.com/',
               'Upgrade-Insecure-Requests': '1',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/66.0.3359.181 Safari/537.36'}
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        r = requests.post(url, headers=headers, data=data, timeout=timeout)
    except requests.RequestException:
        print("爬取失败!")
        return None
    r.encoding = "utf-8"
    if r.status_code != 200:
        return None
    try:
        # Return the body decoded as JSON.
        return r.json()
    except ValueError:
        # The body was not valid JSON.
        print("爬取失败!")
        return None

生成指定长度的随机字符串（本文中长度为16）

def generate_random_strs(length):
    """Return a random string of ``length`` ASCII letters and digits.

    Args:
        length: Number of characters to generate; zero or a negative value
            yields an empty string.

    Returns:
        A ``str`` whose characters are drawn independently from
        ``a-z``, ``A-Z`` and ``0-9``.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
    # random.choice replaces the manual floor(random() * len) indexing, and
    # join avoids rebuilding the string (and a list copy of the alphabet) on
    # every iteration.
    return "".join(random.choice(alphabet) for _ in range(length))

AES加密

def AESencrypt(msg, key):
    """Encrypt ``msg`` with AES in CBC mode and return Base64 text.

    Args:
        msg: Plaintext as ``str`` (encoded as UTF-8) or ``bytes``.
        key: AES key as ``str`` (encoded as UTF-8) or ``bytes``.

    Returns:
        The ciphertext, Base64-encoded and decoded to ``str``.
    """
    # The cipher works on bytes, so text inputs are encoded up front. Padding
    # must also be computed on the byte length, not the character count.
    if isinstance(msg, str):
        msg = msg.encode('utf-8')
    if isinstance(key, str):
        key = key.encode('utf-8')
    # Pad to a multiple of 16 bytes; each padding byte holds the padding
    # length, and a full 16-byte block is added when already aligned.
    padding = 16 - len(msg) % 16
    msg = msg + bytes([padding]) * padding
    # Fixed 16-byte initialization vector.
    iv = b'0102030405060708'

    cipher = AES.new(key, AES.MODE_CBC, iv)
    encryptedbytes = cipher.encrypt(msg)
    # Base64 yields bytes; decode so callers get text.
    return base64.b64encode(encryptedbytes).decode('utf-8')

RSA加密

def RSAencrypt(randomstrs, key, f):
    """Raise the reversed text to a power modulo ``f`` and return it as hex.

    Args:
        randomstrs: Text to encrypt; it is reversed before conversion.
        key: Exponent as a hexadecimal string.
        f: Modulus as a hexadecimal string.

    Returns:
        Lower-case hexadecimal string of the result, left-padded with zeros
        to at least 256 characters.
    """
    # Reverse the text, then read its UTF-8 bytes as one big-endian integer.
    message = int.from_bytes(randomstrs[::-1].encode('utf-8'), 'big')
    # Three-argument pow reduces modulo f at every step instead of first
    # building the full message ** exponent integer.
    seckey = pow(message, int(key, 16), int(f, 16))
    return format(seckey, 'x').zfill(256)

获取参数

def get_params(page):
    """Build the two encrypted form fields for one page of comments.

    Args:
        page: 1-based page number; the request offset is ``(page - 1) * 20``.

    Returns:
        A ``(params, encSecKey)`` tuple: the payload after two AESencrypt
        passes, and the RSAencrypt output for the random key used in the
        second pass.
    """
    # Per the original author's notes: offset and limit are the required
    # payload fields and the other fields are optional; the server is said to
    # cap limit at 100. This function always requests 20 per page.
    offset = (page - 1) * 20
    payload = "".join((
        '{"offset":',
        str(offset),
        ',"total":"True","limit":"20","csrf_token":""}',
    ))
    modulus = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
    # First pass uses the fixed key.
    first_pass = AESencrypt(payload, '0CoJUm6Qyw8W8jud')
    # Second pass uses a fresh 16-character random key; its output is the
    # value sent as "params".
    random_key = generate_random_strs(16)
    second_pass = AESencrypt(first_pass, random_key)
    # The random key itself goes through RSAencrypt to become "encSecKey".
    sec_key = RSAencrypt(random_key, '010001', modulus)
    return second_pass, sec_key

最主要的部分来了

def comments(html, songname, i, pages, total, filepath):
    """Append the text of every comment on one result page to ``filepath``.

    Args:
        html: Decoded JSON for one page. Entries are read from
            ``html['comments']`` and each entry's ``content`` is written on
            its own line. A falsy value (such as ``None`` from a failed
            request) or a missing ``comments`` key writes nothing.
        songname: Unused; kept so existing calls keep working.
        i: Current page number, used only in the progress line.
        pages: Total number of pages, used only in the progress line.
        total: Unused; kept so existing calls keep working.
        filepath: File to append to, written as UTF-8.
    """
    print("{}/{}\n".format( i, pages))
    page_comments = html.get('comments', []) if html else []
    # Open the file once per page rather than once per comment; the with
    # block closes it, so no explicit close() is needed.
    with open(filepath, 'a', encoding='utf-8') as f:
        f.writelines(item['content'] + '\n' for item in page_comments)

歌曲id号

# Copy the values captured by the form callbacks go() and go1() into the names
# used by the crawler code.
# NOTE(review): song_id and path2 are only assigned inside those callbacks, so
# these lines raise NameError if the matching button was never clicked.
songid = song_id
filepath = path2

歌曲名字

# file_name is set by the go2() form callback; it becomes the output file's
# base name.
songname = file_name
# Echo the three collected inputs to the console.
print(songid)
print(filepath)
print(songname)

文件存储路径

# Build the output path by plain concatenation: directory text + song name +
# ".txt". No separator is inserted, so the directory text must already end
# with one.
filepath = filepath  + songname + ".txt"
# Start with page 1.
page = 1
# Encrypted form fields for that page.
params, encSecKey = get_params(page)

获取第一页评论

# Comment endpoint for the chosen song.
url = 'https://music.163.com/weapi/v1/resource/comments/R_SO_4_' + str(songid) + '?csrf_token='
data = {'params': params, 'encSecKey': encSecKey}
# Fetch the first page of comments.
html = get_comments_json(url, data)
# get_comments_json returns None when the request fails or the status is not
# 200. Stop with a clear message instead of an opaque error on html['total'].
if not html or 'total' not in html:
    raise SystemExit("爬取失败!")
# Total number of comments reported by the server.
total = html['total']
# Number of pages at 20 comments per page.
pages = math.ceil(total / 20)
comments(html, songname, page, pages, total, filepath)

获取全部评论

# Fetch pages 2..pages; page 1 was already written.
page = 2
while page <= pages:
    params, encSecKey = get_params(page)
    data = {'params': params, 'encSecKey': encSecKey}
    html = get_comments_json(url, data)
    if html:
        comments(html, songname, page, pages, total, filepath)
    else:
        # get_comments_json returns None on failure. Skip this page so a
        # single failed request does not abort the whole crawl.
        print("第{}页爬取失败，已跳过".format(page))
    page += 1

数据预处理

# Data cleaning: copy the input file to the output file, dropping every line
# that consists of a newline only. Both paths come from the form callbacks
# go3() and go4().
filepath1 = cleaning_file
filepath2 = cleaned_file
# The with statement closes both files even if copying fails part-way.
with open(filepath1, 'r', encoding='utf-8') as file1, \
        open(filepath2, 'w', encoding='utf-8') as file2:
    print("转换中......")
    # Stream line by line instead of loading the whole file with readlines().
    file2.writelines(line for line in file1 if line != '\n')
# Reached only when the copy finished without an error.
print("执行完毕！")

情感分析SnowNLP及可视化

# Load the comments to analyse: one per line, trailing newline removed,
# duplicates dropped while keeping first-seen order.
with open(analysis_path, mode='r', encoding='utf-8') as f:
    rows = f.readlines()
# De-duplicate on the stripped text. Comparing the raw line (which still has
# its "\n") against already-stripped entries would never match, so duplicates
# would slip through.
comment = list(dict.fromkeys(row.strip('\n') for row in rows))
def snowanalysis(self):
    """Score each text with SnowNLP and plot the results.

    Shows a histogram of the scores, then a bar chart counting texts scored
    above 0.5 ("positive") against the rest ("negative"). The texts, scores
    and counts are also printed.

    Args:
        self: Iterable of comment strings. Despite the name this is a plain
            argument, not an instance; the name is kept so existing calls
            keep working.
    """
    sentimentslist = []
    for li in self:
        print(li)
        # Read the score once and reuse it for both the print and the list.
        score = SnowNLP(li).sentiments
        print(score)
        sentimentslist.append(score)
    # 101 evenly spaced edges give 100 bins covering the closed range [0, 1].
    # np.arange(0, 1, 0.01) stops at 0.99, so scores above 0.99 would fall
    # outside the last bin and be left out of the histogram.
    plt.hist(sentimentslist, bins=np.linspace(0, 1, 101))
    plt.show()
    print(sentimentslist)

    # Map each score to +1 (above 0.5) or -1 (otherwise).
    labels = [1 if score > 0.5 else -1 for score in sentimentslist]
    print(labels)
    positive = labels.count(1)
    negative = len(labels) - positive
    # Order matches the tick labels below: negative first, then positive.
    info = [negative, positive]
    print(info)
    info2 = ['negative', 'positive']
    plt.bar(info2, info, tick_label=info2, color='#2FC25B')
    plt.show()
# Run the sentiment scoring and plotting over the loaded comment list.
snowanalysis(comment)