思路:
1.安装依赖:pip3 install bilibili-api-python(帮助文档见《bilibili-api 开发文档》)
2.读一下帮助文档即可,真的很方便就能爬出数据,大佬把各种东西都写在上面了,感谢大佬
3.自行进行数据处理即可
代码实现:
video.py(数据收集部分)
from bilibili_api import bangumi, sync
import csv
import time
import random
# Series to crawl: each entry is [series title, numeric media id].
# The title is only used for the CSV's '番剧名' column; the id is passed
# to bangumi.Bangumi() below (presumably the "md" number from the bangumi
# page URL — TODO confirm).
md = [
    ['迷宫饭', 21174614],
    ['葬送的芙莉莲', 21087073],
    ['物理魔法使-马修- 神觉者候选人选拔考试篇', 21194297],
    ['物理魔法使-马修-', 20140467],
    ['我独自升级', 21194555],
    ['致不灭的你', 28233896],
    ['致不灭的你 第二季', 28339716],
    ['咒术回战', 28229899],
    ['咒术回战 第二季', 20310848],
    ['Free!', 120712],
    ['Free! -Eternal Summer-', 124972],
    ['冰海战记', 28220475],
    ['冰海战记 第二季', 20286653],
    ['间谍过家家', 28237119],
    ['间谍过家家 第二季', 21086686],
    ['紫罗兰永恒花园', 8892]
]
# Module-level output CSV shared by write_comment() below.
# NOTE(review): the handle is opened here and never explicitly closed;
# rows could be lost if the process dies before interpreter shutdown
# flushes the buffer — consider a `with` block or an explicit close.
f = open('B站番剧长评论.csv', mode='w', encoding='utf-8', newline='')  # create the file object that receives the data
csv_writer = csv.DictWriter(f, fieldnames=[
    '番剧名', '用户名', '头像', '会员等级', '账户等级',
    '标题', '评论','追番进度',
    '评论时间', '打分', '点赞量',
    '回复数', '链接'
])
csv_writer.writeheader()
async def get_long_comment(md):
    """Fetch every page of long-form reviews for one bangumi series.

    Args:
        md: numeric media id of the series, passed to ``bangumi.Bangumi``.

    Returns:
        list: raw review dicts accumulated across all pages.
    """
    b = bangumi.Bangumi(md)
    cursor = None  # page cursor; renamed from `next` to avoid shadowing the builtin
    comments = []
    # The API signals the last page by returning 0 in the 'next' field.
    while cursor != 0:
        page = await b.get_long_comment_list(next=cursor)
        comments.extend(page['list'])
        cursor = page['next']
    return comments
def write_comment(cmts, name):
    """Write one CSV row (via the module-level ``csv_writer``) per review.

    Args:
        cmts: list of review dicts as returned by ``get_long_comment``.
        name: series title, stored in the '番剧名' column.
    """
    for cmt in cmts:
        author = cmt['author']
        vip_text = author['vip_label']['text']  # membership label; empty string for non-members
        if vip_text == "":
            vip_text = "普通用户"
        csv_writer.writerow({
            '番剧名': name,
            '用户名': author['uname'],
            '头像': author['avatar'],
            '会员等级': vip_text,
            '账户等级': author['level'],
            '标题': cmt['title'],
            '评论': cmt['content'],
            # .get() instead of .setdefault(): same value, but does not
            # mutate the source dict when 'progress' is absent.
            '追番进度': cmt.get('progress'),
            '评论时间': cmt['push_time_str'],
            '打分': cmt['score'],
            '点赞量': cmt['stat']['likes'],
            '回复数': cmt['stat']['reply'],
            '链接': cmt['url'],
        })
if __name__ == '__main__':
    # Crawl every series in `md` and append its reviews to the shared CSV.
    for title, media_id in md:
        # Random pause between series to avoid hammering the remote API.
        time.sleep(random.random() * 3)
        reviews = sync(get_long_comment(media_id))
        write_comment(reviews, title)
main.py(数据处理部分,画饼状图,画词云分析)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import jieba
from wordcloud import WordCloud
from PIL import Image
def level(read):
    """Show a pie chart of review counts grouped by account level (账户等级)."""
    plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly with a CJK font
    plt.rcParams['font.sans-serif'] = 'simhei'  # CJK-capable font for the Chinese labels
    # Count rows per account level (non-null '会员等级' cells), largest first.
    counts = read.groupby('账户等级').count()['会员等级'].sort_values(ascending=False)
    print(counts.index)
    print(counts.values)
    plt.pie(counts.values, labels=counts.index, autopct='%.1f%%')
    plt.legend(counts.index)
    plt.show()
def vip(read):
    """Show a pie chart of review counts grouped by membership tier (会员等级)."""
    plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly with a CJK font
    plt.rcParams['font.sans-serif'] = 'simhei'  # CJK-capable font for the Chinese labels
    # Count rows per membership tier (non-null '回复数' cells), largest first.
    counts = read.groupby('会员等级').count()['回复数'].sort_values(ascending=False)
    print(counts.index)
    print(counts.values)
    plt.pie(counts.values, labels=counts.index, autopct='%.1f%%')
    plt.legend(counts.index)
    plt.show()
def wordcloud(read):
    """Build, save, and display a word cloud from an iterable of comments.

    Args:
        read: iterable of comment texts (e.g. a pandas Series).
    """
    # ''.join builds the corpus in one pass; the original per-item
    # `text = text + str(i)` concatenation is quadratic.
    text = ''.join(str(i) for i in read)
    # Chinese has no word spacing, so segment with jieba, then join the
    # tokens with spaces — the format WordCloud expects.
    result = ' '.join(jieba.cut(text))
    font = r'C:\Windows\Fonts\simfang.ttf'  # a CJK-capable font is required for Chinese glyphs
    wc = WordCloud(
        background_color='white',  # default is black
        font_path=font,
        width=500,
        height=350,
        max_font_size=50,
        min_font_size=10,
        mode='RGBA'  # per wordcloud docs, transparency needs RGBA *and* background_color=None
    )
    wc.generate(result)  # build the cloud from the segmented text
    wc.to_file(r"wordcloud.png")  # persist the rendered image
    plt.imshow(wc)
    plt.axis("off")  # hide the axes around the image
    plt.show()  # imshow only prepares the figure; show() actually displays it
if __name__ == '__main__':
    # Load the scraped reviews into a DataFrame.
    # NOTE(review): the scraper above writes 'B站番剧长评论.csv' but this
    # reads 'long_comment.csv' — confirm the file was renamed in between.
    read = pd.read_csv('long_comment.csv')
    # print(df)
    level(read)
    # vip(read)
    # wordcloud(read["评论"])