输入b站的视频URL返回弹幕可视化图

最新推荐文章于 2024-07-21 21:32:53 发布

钱甫新

最新推荐文章于 2024-07-21 21:32:53 发布

阅读量796

点赞数

文章标签： python 数据分析 数据挖掘 可视化

本文链接：https://blog.csdn.net/Mr_Qian_Ives/article/details/108271446

版权

import json
import re
from urllib import parse

import jieba
import matplotlib.pyplot as plt
import requests
from wordcloud import WordCloud
# NOTE: this rebinds the name `parse` to xmltodict.parse, shadowing the
# urllib `parse` imported above; keep this import after the urllib one.
from xmltodict import parse

'''
词云图可以展示当前视频的关键字（首先剔除无用信息，例如 哈哈哈、啊啊啊）
折线图可以显示当前视频的热度走向（哪天火起来的等等）
后期可以补充折线图
'''
# Module-level accumulators shared by the functions in this script.
allContentAndTime = []  # one "<time>:<text>" string per collected danmu
allContent = []  # text of every collected danmu
allWord = []  # words obtained by segmenting the danmu text


# 从URL得到BV
def from_URL_Get_BV(url):
    """Extract the BV id of a Bilibili video from its URL.

    The id is located by pattern ("BV" followed by ten alphanumeric
    characters), so a trailing slash, extra path segments or a query string
    after the id do not matter.  When no such id is present, the first 12
    characters of the last '/'-separated segment are returned, which is what
    this function did before pattern matching was added.

    Args:
        url: Video page URL, e.g. ``https://www.bilibili.com/video/BV...``.

    Returns:
        The BV id as a string, or the legacy fallback described above.
    """
    match = re.search(r'BV[0-9A-Za-z]{10}', url)
    if match:
        return match.group(0)
    # Legacy behaviour for URLs without a recognisable BV id.
    return url.split('/')[-1][:12]


# 从bv得到Cid
def from_Bv_Get_Cid(bv, timeout=10):
    """Look up the cid of the first entry the page-list API returns for a video.

    Args:
        bv: BV id of the video.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``cid`` value of the first element of the response's ``data``
        list.

    Raises:
        requests.RequestException: On network errors, timeouts or a non-2xx
            HTTP status.
        ValueError: If the response carries no ``data`` entries (for example
            when the BV id is unknown to the API).
    """
    url = 'https://api.bilibili.com/x/player/pagelist'
    # Let requests build the query string so that `bv` is URL-encoded.
    params = {'bvid': str(bv), 'jsonp': 'jsonp'}
    # Browser-like User-Agent sent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"}
    # Without a timeout requests may block indefinitely on a stalled server.
    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    response.raise_for_status()
    response.encoding = 'utf-8'
    dic = json.loads(response.text)
    pages = dic.get('data')
    if not pages:
        raise ValueError(
            'no page data returned for bvid %r (code=%r, message=%r)'
            % (bv, dic.get('code'), dic.get('message')))
    return pages[0]['cid']


# 从Cid获取弹幕
def from_Cid_Get_DanMu(cid, timeout=10):
    """Download the danmu (bullet comments) of a cid into the module lists.

    For every ``<d>`` element that has text, the text is appended to
    ``allContent`` and the string ``"<time>:<text>"`` is appended to
    ``allContentAndTime``, where ``<time>`` is the fifth comma-separated
    field of the element's ``p`` attribute.

    Args:
        cid: The cid whose danmu list is requested.
        timeout: Seconds to wait for the HTTP request before giving up.

    Raises:
        requests.RequestException: On network errors, timeouts or a non-2xx
            HTTP status.
    """
    url = 'https://api.bilibili.com/x/v1/dm/list.so'
    # Browser-like User-Agent sent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"}
    # Without a timeout requests may block indefinitely on a stalled server.
    response = requests.get(url, params={'oid': str(cid)}, headers=headers,
                            timeout=timeout)
    response.raise_for_status()
    response.encoding = 'utf-8'
    # `parse` is xmltodict.parse here: XML text -> nested dict.
    data = parse(response.text)
    # xmltodict gives None for an empty <i/>, omits 'd' when there are no
    # comments, and gives a single item (not a list) for exactly one <d>.
    entries = (data['i'] or {}).get('d') or []
    if not isinstance(entries, list):
        entries = [entries]
    for entry in entries:
        # A <d> without text has no '#text' key, and one without attributes
        # is a bare string with no 'p' metadata; neither can be recorded.
        if not isinstance(entry, dict) or entry.get('#text') is None:
            continue
        content = entry['#text']
        fields = str(entry['@p']).split(',')
        sent_at = fields[4]
        # Only mutation of the module lists, so no `global` statement needed.
        allContent.append(content)
        allContentAndTime.append(sent_at + ":" + content)


# 爬虫流程
def parse_Html(url):
    """Run the scraping pipeline for one video URL.

    The URL is reduced to a BV id, the BV id is resolved to a cid, and the
    danmu for that cid are downloaded into the module-level lists.
    """
    from_Cid_Get_DanMu(from_Bv_Get_Cid(from_URL_Get_BV(url)))


# 从弹幕到词
def from_Danmu_to_word():
    """Segment all collected danmu into words and store them in ``allWord``.

    The texts in ``allContent`` are joined with a Chinese full stop so that
    each danmu ends a sentence, and the joined string is cut by jieba with
    ``cut_all=True``.
    """
    global allWord  # rebound below, so the global declaration is required
    allWord = jieba.lcut('。'.join(allContent), cut_all=True)


# Convert the word list into a word cloud
def from_Word_To_WordCloud(font_path='/System/Library/Fonts/STHeiti Medium.ttc',
                           output_path='result.png'):
    """Render the words in ``allWord`` as a word cloud, save it and show it.

    Args:
        font_path: Font file used to draw the words.  A font with Chinese
            glyphs is needed, otherwise Chinese text is rendered garbled.
            The default is a macOS system font; pass another path on other
            systems.
        output_path: File the image is written to.
    """
    wc = WordCloud(
        scale=9,  # larger value = higher resolution but slower rendering
        stopwords=['哈', '哈哈', '哈哈哈', '哈哈哈哈', '哈哈哈哈哈'],  # words to leave out
        background_color='white',
        max_font_size=60,
        random_state=20,  # fixed value (the original note ties it to the colouring)
        font_path=font_path,
    )
    # WordCloud.generate expects a single string of space-separated words.
    cloud = wc.generate(' '.join(allWord))
    # Save before showing: plt.show() may block until the window is closed,
    # and the file should exist even if the display step is interrupted.
    cloud.to_file(output_path)
    plt.imshow(cloud)
    plt.axis('off')  # hide the axes around the image
    plt.show()


if __name__ == '__main__':
    # Script entry point: ask for a video URL, scrape its danmu, segment
    # them into words and draw the word cloud, reporting counts on the way.
    url = input('输入b站视频的URL：')
    parse_Html(url)
    print('共爬取到', len(allContent), '条弹幕', sep='')
    from_Danmu_to_word()
    print('共分割为', len(allWord), '个词语', sep='')
    from_Word_To_WordCloud()