Python爬取B站弹幕并做成云图

置顶 JiuQing.

已于 2024-05-04 14:57:36 修改

阅读量948

点赞数 21

文章标签： python 开发语言 网络爬虫

于 2023-12-27 17:06:21 首次发布

本文链接：https://blog.csdn.net/weixin_52776616/article/details/135250789

版权

#创作灵感#

之前的代码思路失效了，已于5.4更新了代码，现在完全可以用了

这是我学习 Python 时做的新手项目，经过慢慢完善，最后做成了思路全新、运行速度也比较快的脚本。

并且添加了 UI 界面，可以自动判断地址是否正确、视频或弹幕是否存在，最后生成弹幕词云。

接下来就是完整的代码片段，仅供参考，可以自行完善优化。

import re
import requests
import xml.etree.ElementTree as ET
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import tkinter as tk
 
# Seconds to wait for each HTTP request; without a timeout a stalled
# connection would block the Tk callback (and therefore the UI) indefinitely.
_REQUEST_TIMEOUT = 10
# The page-list API is queried up to this many times before giving up.
_MAX_CID_ATTEMPTS = 5


def _extract_cid(video_url, headers):
    """Return the CID of the video's first page, or None on any failure.

    The BV identifier is taken from ``video_url`` and looked up through the
    ``x/player/pagelist`` API. Network errors, non-JSON bodies and
    unexpected response shapes all yield None so the caller can retry.
    """
    bv = re.search(r'BV(\w+)', video_url)
    if not bv:
        return None

    api_url = f'https://api.bilibili.com/x/player/pagelist?bvid={bv.group(1)}'
    try:
        response = requests.get(api_url, headers=headers, timeout=_REQUEST_TIMEOUT)
        data = response.json()
    except (requests.RequestException, ValueError):
        # ValueError covers a body that is not valid JSON.
        return None

    try:
        if data['code'] == 0 and data['data']:
            return data['data'][0]['cid']
    except (KeyError, IndexError, TypeError):
        # Response did not have the expected {"code": ..., "data": [...]} shape.
        pass
    return None


def generate_wordcloud(event=None):
    """Fetch the danmaku of the video in the entry field and show a word cloud.

    Used both as the button command (no argument) and as the ``<Return>``
    key handler (receives the Tk event, which is ignored). Progress and
    error messages are written to ``status_label``; nothing is returned.
    """
    # Read the address typed by the user; surrounding whitespace from a
    # copy/paste would otherwise fail the format check.
    video_url = entry.get().strip()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Referer': 'https://www.bilibili.com/',
    }

    # Validate the address format (dots escaped, at least one character
    # required after "BV").
    if not re.match(r'^https://www\.bilibili\.com/video/BV\w+', video_url):
        status_label.config(text="地址格式不正确，请重新输入")
        return

    status_label.config(text="地址格式正确，正在解析视频弹幕，请稍等...")
    window.update()

    # The API occasionally yields no CID on the first request, so retry a few
    # times; if every attempt fails the address is treated as invalid.
    cid = None
    for _ in range(_MAX_CID_ATTEMPTS):
        cid = _extract_cid(video_url, headers)
        if cid is not None:
            break

    if not cid:
        status_label.config(text="无法获取视频的 CID，请检查视频地址是否正确")
        return

    # Danmaku XML endpoint for this CID
    danmaku_url = f'https://comment.bilibili.com/{cid}.xml'

    # Download the danmaku data with the same browser-like headers as above.
    try:
        response = requests.get(danmaku_url, headers=headers, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException:
        status_label.config(text="无法获取弹幕数据")
        return

    # Parse the raw bytes so the XML parser applies the encoding declared in
    # the document instead of a guessed one.
    try:
        danmaku_root = ET.fromstring(response.content)
    except ET.ParseError:
        status_label.config(text="解析弹幕数据时发生错误")
        return

    # Count how often each danmaku text occurs. Elements without text have
    # ``text is None`` and blank ones carry nothing to draw, so both are skipped.
    texts = (danmaku.text for danmaku in danmaku_root.findall('d'))
    counter = Counter(text for text in texts if text and text.strip())

    if not counter:
        status_label.config(text="弹幕列表为空")
        return

    status_label.config(text="弹幕解析成功，正在生成云图...")
    window.update()

    # Build the word cloud. The font file is resolved by the imaging library;
    # an unreadable font surfaces as OSError, reported here instead of leaving
    # the status label stuck on the "generating" message.
    font_path = 'msyh.ttc'
    try:
        wordcloud = WordCloud(
            font_path=font_path,
            width=800,
            height=800,
            background_color='white',
            colormap='YlOrBr',
        ).generate_from_frequencies(counter)
    except OSError:
        status_label.config(text="无法加载字体文件 msyh.ttc")
        return

    status_label.config(text="云图生成成功！")
    window.update()

    # Display the word cloud, then clear the status line once the plot
    # window has been closed.
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
    status_label.config(text="")
    window.update()
 
def on_focus_in(event):
    """Remove the placeholder address when the entry field gains focus.

    If the field still holds the placeholder, it is emptied and the text
    colour is switched to black; any other content is left alone.
    """
    showing_placeholder = entry.get() == "https://www.bilibili.com/video/BVXXXXXXXXXX"
    if not showing_placeholder:
        return
    entry.delete(0, tk.END)
    entry.config(fg="black")
 
def on_focus_out(event):
    """Restore the grey placeholder address when the field is left empty."""
    if entry.get() != "":
        return
    entry.insert(0, "https://www.bilibili.com/video/BVXXXXXXXXXX")
    entry.config(fg="gray")
 
# Create the main application window
window = tk.Tk()
window.title("Bilibili弹幕云图生成器")

# Fix the window size and place it at the centre of the screen
window_width, window_height = 300, 130
screen_width = window.winfo_screenwidth()
screen_height = window.winfo_screenheight()
x = screen_width // 2 - window_width // 2
y = screen_height // 2 - window_height // 2
window.geometry("{}x{}+{}+{}".format(window_width, window_height, x, y))
 
# 设置窗口优先级
def on_window_open(event):
    """Mark the main window as always-on-top when it is mapped.

    Bound to the window's ``<Map>`` event. A binding on a toplevel is also
    triggered for its child widgets, so events from children are ignored.
    """
    if event.widget is not window:
        return
    # Reset the flag before setting it; the final state is topmost enabled.
    window.attributes('-topmost', False)
    window.attributes('-topmost', True)


# Register the handler at module level. Previously this call sat inside the
# handler's own body, so the handler was never bound and never ran.
window.bind("<Map>", on_window_open)
 
 
# Prompt label shown above the input field
label = tk.Label(window, text="\n请输入需要解析视频的地址")
label.pack()

# Address input field, pre-filled with a grey placeholder URL
entry = tk.Entry(window, width=50, fg="gray")
entry.insert(0, "https://www.bilibili.com/video/BVXXXXXXXXXX")
entry.bind('<FocusIn>', on_focus_in)
entry.bind('<FocusOut>', on_focus_out)
# Pressing Enter in the field triggers the same action as the button
entry.bind('<Return>', generate_wordcloud)
entry.pack(pady=10)

# Button that starts the word-cloud generation
button = tk.Button(window, text="开始生成", command=generate_wordcloud)
button.pack()

# Status line used for progress and error messages
status_label = tk.Label(window, text="")
status_label.pack()

# Enter the Tk event loop
window.mainloop()