from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
# Request headers. NOTE: Selenium does not send this dict; nothing in this
# section reads it, it is only defined here.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
    'referer': 'https://www.bilibili.com',
}
# URL of the video page whose data-cid attributes are collected
video_url = 'https://www.bilibili.com/video/BV1y94y1P7V5?p=5&vd_source=c2059fd378b370efe9d97ee1903c9680'
# Configure the browser driver
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # headless mode: no visible browser window
# The options must be handed to the driver; otherwise --headless is never applied
driver = webdriver.Chrome(options=options)
try:
    # Load the video page
    driver.get(video_url)
    time.sleep(5)  # wait for the page to load; adjust to the actual conditions
    # Grab the rendered page source
    page_source = driver.page_source
finally:
    # Always shut the browser down so no Chrome/chromedriver process is left running
    driver.quit()
soup = BeautifulSoup(page_source, 'html.parser')
# Collect every data-cid value; the set removes duplicates
data_cids = {tag['data-cid'] for tag in soup.find_all(attrs={'data-cid': True})}
# Write the data-cid values to a text file, one per line.
# Sorted so the output is reproducible (set iteration order is arbitrary).
with open('data_cids1.txt', 'w', encoding='utf-8') as f:
    for cid in sorted(data_cids):
        f.write(cid + '\n')
import requests
from bs4 import BeautifulSoup as Bs
import time
import json
# cids whose danmaku XML is downloaded, and the request headers to send
cid_list = ['1361106922', '1361094183', '1361106754', '1361106735', '1361106809', '1361106932', '1361106663']
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
}
# One entry per cid is accumulated here and dumped to JSON at the end
data_list = []
# Walk every cid; id_counter is a 1-based running ID for the output entries
for id_counter, cid in enumerate(cid_list, start=1):
    # Build the XML endpoint address for this cid and request it
    danmu_url = 'http://comment.bilibili.com/{}.xml'.format(cid)  # danmaku URL
    print('弹幕地址是:', danmu_url)
    # Send the headers defined above and bound the wait so a stalled
    # connection cannot hang the script forever
    r = requests.get(danmu_url, headers=headers, timeout=10)
    # Parse the raw bytes so the XML parser decodes the document itself.
    # Going through r.text depends on the charset requests guesses and then
    # needs a fragile re-encode/decode round trip to recover the text.
    soup = Bs(r.content, 'xml')
    danmu_list = soup.find_all('d')
    print('共爬取到{}条弹幕'.format(len(danmu_list)))
    # Per-cid record
    video_info = {
        'id': id_counter,  # running ID
        'video_name': '【2023和平精英全球总决赛】12月9日 总决赛DAY2',  # video title
        'url': 'https://www.bilibili.com/video/BV1y94y1P7V5?p=5&vd_source=c2059fd378b370efe9d97ee1903c9680',  # video URL
        'danmu_url': danmu_url,  # danmaku URL
        'danmu': [],  # list of {'time', 'content'} entries
    }
    # Extract each danmaku's time and text
    for d in danmu_list:
        data_split = d['p'].split(',')
        # Field 4 of the comma-separated "p" attribute is interpreted as a
        # Unix timestamp and rendered in local time
        danmu_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(data_split[4])))
        danmu_content = d.text  # already correctly decoded by the parser
        video_info['danmu'].append({
            'time': danmu_time,
            'content': danmu_content,
        })
        print('{}: {}'.format(danmu_time, danmu_content))
    # Store this cid's record
    data_list.append(video_info)
# Write everything to a JSON file
output_file = 'bili.json'
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(data_list, f, ensure_ascii=False, indent=4)
print(f'弹幕数据已写入{output_file}文件中')
import json
# Load the danmaku records from the JSON file
input_file = 'bili.json'
with open(input_file, 'r', encoding='utf-8') as f:
    data_list = json.loads(f.read())
# Replace each record's danmaku list with a copy ordered by its 'time' string
for record in data_list:
    record['danmu'] = sorted(record['danmu'], key=lambda entry: entry['time'])
# Save the sorted records to a separate JSON file
output_file_sorted = 'bili_326.json'
serialized = json.dumps(data_list, ensure_ascii=False, indent=4)
with open(output_file_sorted, 'w', encoding='utf-8') as f:
    f.write(serialized)
print(f'排序后的弹幕数据已写入 {output_file_sorted} 文件中')