1.在首页选取一个视频,检查网页源代码,查看评论所处的位置
这里爬取B站视频评论区中用户的昵称、性别、签名、评论内容、发布时间、归属地等信息。
import requests
import datetime
import csv
url = 'https://api.bilibili.com/x/v2/reply/wbi/main?oid=919114382&type=1&mode=3&pagination_str=%7B%22offset%22:%22%22%7D&plat=1&seek_rpid=&web_location=1315875&w_rid=662e6cfa6de007f47971504fdd7361fb&wts=1701617200'

# Paste the Cookie header value copied from your browser here.
# When left empty, no Cookie header is sent at all.
COOKIE = ''

# Column order of the CSV output; also the keys produced by reply_to_row().
FIELDNAMES = ['昵称', '性别', '签名', '内容', '发布时间', '归属地']


def reply_to_row(reply):
    """Convert one reply object of the API response into a CSV row dict.

    Args:
        reply: One element of ``json_data['data']['replies']``.

    Returns:
        A dict keyed by ``FIELDNAMES``. The publish time is the reply's
        ``ctime`` rendered as a local-time string, and the location has its
        ``IP属地:`` prefix removed.

    Raises:
        KeyError, TypeError, AttributeError: If an expected field is missing
            or has an unexpected type.
    """
    return {
        '昵称': reply['member']['uname'],
        '性别': reply['member']['sex'],
        '签名': reply['member']['sign'],
        '内容': reply['content']['message'],
        '发布时间': str(datetime.datetime.fromtimestamp(reply['ctime'])),
        '归属地': reply['reply_control']['location'].replace('IP属地:', ''),
    }


if __name__ == '__main__':
    headers = {
        'Referer': 'https://www.bilibili.com/video/BV16N411u76v/?spm_id_from=333.1007.top_right_bar_window_history.content.click&vd_source=1702d37bb5ca4b1f33292dd3a16bb428',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
    }
    if COOKIE:
        headers['Cookie'] = COOKIE

    # Fetch before touching the output file, so a failed request does not
    # leave a header-only CSV behind.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()
    json_data = response.json()
    # Fall back to an empty list so a null 'replies' value is not iterated.
    replies = json_data['data']['replies'] or []

    # The context manager guarantees the file is flushed and closed.
    with open('data.csv', mode='a', encoding='utf-8', newline='') as f:
        csv_writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        # In append mode the position starts at the end of the file, so 0
        # means the file is new or empty; only then is a header needed.
        if f.tell() == 0:
            csv_writer.writeheader()
        for reply in replies:
            try:
                row = reply_to_row(reply)
            except (KeyError, TypeError, AttributeError) as exc:
                # Best effort: skip incomplete replies, but say so.
                print(f'skipped reply with missing field: {exc!r}')
                continue
            csv_writer.writerow(row)
            print(row)
结果保存为 csv 文件(data.csv),文件内容如下所示:
接下来进行词云图展示:这里只选取评论内容部分,并将其另存为 txt 文件(data.txt)。
import jieba
import collections
import re
from pyecharts.charts import WordCloud
from pyecharts.globals import SymbolType
from pyecharts import options as opts
from pyecharts.globals import ThemeType, CurrentConfig
def extract_chinese(text):
    """Return the runs of Chinese characters in *text*, joined by spaces.

    Everything outside the range U+4E00..U+9FA5 (Latin letters, digits,
    punctuation, whitespace) is discarded.
    """
    return " ".join(re.findall('[\u4e00-\u9fa5]+', text))


def load_stop_words(path='stop_words.txt'):
    """Read a UTF-8 stop-word file (one word per line) into a set."""
    with open(path, encoding='utf-8') as f:
        # Strip only the trailing newline of each line.
        return {line.rstrip('\n') for line in f}


def filter_words(words, stop_words):
    """Keep words that are longer than one character and not stop words."""
    return [word for word in words
            if word not in stop_words and len(word) > 1]


def render_comment_wordcloud():
    """Build a word cloud of the 100 most frequent words in ``data.txt``.

    Reads the scraped comments from ``data.txt``, segments them with jieba,
    removes stop words listed in ``stop_words.txt`` and single-character
    words, prints the intermediate results, and renders the chart to
    ``评论词云图.html``.
    """
    # Open the scraped comment data. The encoding is given explicitly so the
    # result does not depend on the platform default; the stop-word file is
    # read as UTF-8 as well.
    with open('data.txt', encoding='utf-8') as f:
        data = f.read()

    # Text preprocessing: keep only the Chinese characters.
    new_data = extract_chinese(data)

    # Word segmentation in accurate mode. cut_all=False selects accurate
    # mode; cut_all=True would select full mode, which emits overlapping
    # candidate words and inflates the frequency counts.
    seg_list_exact = jieba.cut(new_data, cut_all=False)

    stop_words = load_stop_words()
    result_list = filter_words(seg_list_exact, stop_words)
    print(result_list)

    # Count the remaining words and take the 100 most frequent ones.
    word_counts = collections.Counter(result_list)
    word_counts_top100 = word_counts.most_common(100)
    # Print the frequencies for inspection.
    print(word_counts_top100)

    word1 = WordCloud(init_opts=opts.InitOpts(width='1350px', height='750px', theme=ThemeType.MACARONS))
    word1.add('词频', data_pair=word_counts_top100,
              word_size_range=[15, 108], textstyle_opts=opts.TextStyleOpts(font_family='cursive'),
              shape=SymbolType.DIAMOND)
    word1.set_global_opts(title_opts=opts.TitleOpts('评论词云图'),
                          toolbox_opts=opts.ToolboxOpts(is_show=True, orient='vertical'),
                          tooltip_opts=opts.TooltipOpts(is_show=True, background_color='red', border_color='yellow'))
    word1.render("评论词云图.html")


if __name__ == '__main__':
    render_comment_wordcloud()
输出结果