1. 打开一个视频
2. 查看抓包信息找到弹幕接口
3. 请求弹幕接口即可得到包含弹幕的xml
4. 弹幕接口的 URL 带有 oid 参数,因此需要先获取 oid。在网页源代码中搜索 oid 的值,可以发现 cid 就是 oid,所以可以用正则表达式从网页源代码中提取 cid
5. 代码实现
# coding=utf-8
import re
import requests
import jieba
import pandas
import matplotlib.pyplot as plt
from lxml import etree
from wordcloud import WordCloud
# Script flow: read a Bilibili video URL, download the video's danmaku
# (bullet comments), save them to a CSV file and render a word cloud image.
url = input('请输入B站短视频链接:')
# Example URL: https://www.bilibili.com/video/BV11p411o73u
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'}
# Seconds to wait for each HTTP request; requests never times out by default,
# so a stalled connection would otherwise hang the script forever.
request_timeout = 10

# Fetch the video page to obtain the cid and the video title.
page_response = requests.get(url=url, headers=headers, timeout=request_timeout)
page_response.raise_for_status()
html_text = page_response.text

# The danmaku API takes the video's cid as its `oid` parameter.
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
cid_match = re.search(r'cid=(\d+)&aid=\d+', html_text)
if cid_match is None:
    raise ValueError('could not find a cid in the page source of {}'.format(url))
cid = cid_match.group(1)

html = etree.HTML(html_text)
title_nodes = html.xpath("//span[@class='tit']/text()")
# Fall back to the cid when the title element is absent so the output file
# still gets a usable name instead of the script dying with IndexError.
title = title_nodes[0] if title_nodes else cid
# Replace characters that are not allowed in file names on common platforms.
safe_title = re.sub(r'[\\/:*?"<>|\r\n]+', '_', title).strip()

# Fetch the danmaku list (an XML document with one <d> element per comment).
response = requests.get(
    'https://api.bilibili.com/x/v1/dm/list.so?oid={}'.format(cid),
    headers=headers,
    timeout=request_timeout,
)
response.raise_for_status()
xml = etree.fromstring(response.content)
danmu_list = xml.xpath("/i/d/text()")
if not danmu_list:
    raise ValueError('no danmaku returned for cid {}'.format(cid))

# Save the danmaku to CSV. 'utf_8_sig' writes a BOM so that spreadsheet
# programs detect the encoding and display the Chinese text correctly.
dm_df = pandas.DataFrame(danmu_list)
dm_df.to_csv('弹幕-{}.csv'.format(safe_title), encoding='utf_8_sig', header=False, index=False)

# Build the text from the in-memory list rather than re-reading the CSV:
# decoding that file as plain 'utf-8' would leave the BOM attached to the
# first word, and CSV quoting characters would leak into the text.
txt = '\n'.join(danmu_list)

# Segment the text with jieba and join the tokens with spaces, which is the
# word separator WordCloud expects.
txt = ' '.join(jieba.lcut(txt))

# Load the image whose shape is used as the word cloud mask.
mask_ima = plt.imread('heart.jpg')

# Build the word cloud. width/height are not passed because WordCloud
# ignores them when a mask is supplied.
wordcloud = WordCloud(
    font_path='msyh.ttc',      # Microsoft YaHei font file
    min_font_size=10,          # smallest font size
    max_words=1000,            # maximum number of words
    max_font_size=150,         # largest font size
    stopwords={' '},           # words to exclude
    mask=mask_ima,             # custom mask image
    background_color='black',  # background colour
).generate(txt)

# Save the word cloud image.
wordcloud.to_file('B站弹幕词云图.png')
保存到csv的弹幕数据
词云图效果