网易云音乐评论爬取1.0

在这里插入图片描述

import requests
import json
import base64
import random
import time
from datetime import datetime
from Crypto.Cipher import AES
import pandas as pd
import os

# Pool of desktop-browser User-Agent strings; one is picked at random per run
# so repeated requests look less uniform to the server.
agents = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
# Static request headers expected by the NetEase Cloud Music web API.
headers = {
    'Host': 'music.163.com',
    'Origin': 'https://music.163.com',
    'Referer': 'https://music.163.com/song?id=28793052',
    # random.choice is the idiomatic way to pick one element; the original
    # ''.join(random.sample(agents, 1)) produced the same single string.
    'User-Agent': random.choice(agents)
}
# Fixed parameters of the weapi encryption scheme: RSA public exponent,
# RSA modulus (hex), and the site AES key for the first encryption pass.
param2 = "010001"
param3 = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
param4 = "0CoJUm6Qyw8W8jud"
# Comment-listing endpoint; the csrf_token query parameter is left blank.
url = "https://music.163.com/weapi/comment/resource/comments/get?csrf_token="
class crawlComments(object):
    """Crawl the comments of one NetEase Cloud Music song and save to Excel.

    ``song_data`` is one "song_name|singer|song_id" line from 歌曲.txt;
    ``page`` is the number of comment pages (20 comments each) to fetch.
    """

    def __init__(self, song_data, page):
        self.page = page
        # Split the "name|singer|id" record from the input file.
        self.song_name, self.singer, self.song_id = song_data.split('|')
        self.df_all = pd.DataFrame()  # accumulates every fetched comment

    @staticmethod
    def AES_encrypt(text, key, iv):
        """AES-CBC encrypt *text* (PKCS#7 padded) and return base64 text.

        Declared @staticmethod so that instance-attribute access yields a
        plain function rather than a bound method.
        """
        pad = 16 - len(text) % 16
        padded = (text + pad * chr(pad)).encode("utf-8")
        encryptor = AES.new(key.encode('utf-8'), AES.MODE_CBC, iv)
        return base64.b64encode(encryptor.encrypt(padded)).decode('utf-8')

    @staticmethod
    def asrsea(p1, p2, p3, p4):
        """Build the weapi ``params``/``encSecKey`` pair for payload *p1*.

        *p2*/*p3* (RSA exponent and modulus) are accepted for interface
        compatibility but unused here: a fixed client-random string and its
        pre-computed RSA encryption (encSecKey) are hard-coded instead.
        """
        res = {}
        rand_num = "aq9d7cvBOJ1tzj1o"
        vi = b"0102030405060708"
        # Two chained AES-CBC passes: first with the site key p4, then with
        # the fixed client random string.
        h_encText = crawlComments.AES_encrypt(p1, p4, vi)
        h_encText = crawlComments.AES_encrypt(h_encText, rand_num, vi)
        res["encText"] = h_encText
        res["encSecKey"] = "5dec9ded1d7223302cc7db8d7e0428b04139743ab7e3d451ae47837f34e66f9a86f63e45ef20d147c33d88530a6c3c9d9d88e38586b42ee30ce43fbf3283a2b10e3118b76e11d6561d80e33ae38deb96832b1a358665c0579b1576b21f995829d45fc43612eede2ac243c6ebb6c2d16127742f3ac913d3ac7d6026b44cee424e"
        return res

    def get_json(self):
        """Fetch ``self.page`` pages of comments, handing each page to
        :meth:`get_comments`; the last page triggers the Excel save."""
        react = False  # flips to True on the final page -> save the file
        for i in range(self.page):
            curr_time = int(time.time() * 1000)  # cursor: current ms timestamp
            param1 = json.dumps(
                {"csrf_token": "", "cursor": "%s" % curr_time, "offset": str(i * 20), "orderType": "2",
                 "pageNo": str(i + 1),
                 "pageSize": "20", "rid": "R_SO_4_" + self.song_id.strip(),
                 "threadId": "R_SO_4_" + self.song_id.strip()})
            # BUG FIX: the original called cc.asrsea(...) on the global
            # instance; the bound call passed an extra implicit argument into
            # the 4-parameter function (TypeError). Call the static method
            # through self instead (also removes the global coupling).
            asrsea_res = self.asrsea(param1, param2, param3, param4)
            # Form-encoded body expected by the weapi endpoint.
            param_data = {"params": asrsea_res["encText"],
                          "encSecKey": asrsea_res["encSecKey"]}
            # NOTE(review): verify=False disables TLS certificate checking;
            # kept for parity with the original, but consider removing it.
            response = requests.post(url, headers=headers, data=param_data, verify=False)
            sole_comments = json.loads(response.text)["data"]["comments"]
            if i == self.page - 1:  # last page for this song -> save
                react = True
            self.get_comments(sole_comments, react)
            time.sleep(random.choice(range(1, 3)))  # polite 1-2 s delay

    def get_comments(self, comment_json, react):
        """Append the parsed comments of one page to ``self.df_all``;
        when *react* is True, write the accumulated frame to Excel."""
        # os.path.join instead of the original hard-coded "\\comments" so the
        # path also works on non-Windows systems; local name no longer
        # shadows the builtin `dir`.
        out_dir = os.path.join(os.getcwd(), "comments")
        if not os.path.exists(out_dir):  # create the output folder on demand
            os.makedirs(out_dir)
        rows = []
        for comment in comment_json:
            rows.append({
                '用户昵称': comment['user']['nickname'],
                '评论': comment['content'],
                # epoch milliseconds -> "YYYY-mm-dd HH:MM:SS" local time;
                # local name renamed so it no longer shadows the time module
                '评论时间': datetime.fromtimestamp(comment['time'] / 1000.0).strftime('%Y-%m-%d %H:%M:%S'),
                '点赞数': comment['likedCount'],
            })
        if rows:
            # DataFrame.append was removed in pandas 2.x; concat once per
            # page is also far cheaper than one append per comment.
            self.df_all = pd.concat([self.df_all, pd.DataFrame(rows)], ignore_index=True)
        if react:
            self.df_all.to_excel(
                os.path.join('comments', self.song_name + '--' + self.singer + '.xlsx'),
                index=False)


if __name__ == '__main__':
    # Load every "name|singer|id" record, ask how many comment pages to
    # crawl, then process the songs one after another and report the
    # total elapsed time.
    start_time = time.time()
    with open('歌曲.txt', 'r', encoding="utf8") as f:
        songs_data = f.readlines()
    page = int(input("请输入爬取页数:"))
    for song_no, song_line in enumerate(songs_data, start=1):
        print("正在抓取第%d首歌曲的评论..." % song_no)
        # `cc` stays a module-level name: crawlComments.get_json reads it.
        cc = crawlComments(song_line, page)
        cc.get_json()
    end_time = time.time()
    print("耗时%f秒。" % (end_time - start_time))

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值