1.首先获取某一视频的弹幕
用此方法获取B站视频的XML弹幕
2.弹幕解析部分
将xml弹幕解析,并写入csv文件
代码部分:
import xml.etree.ElementTree as ET
import csv
import time
from datetime import datetime, timedelta, timezone

# Default input/output paths used when this file is run as a script.
XML_PATH = 'Test.xml'
CSV_PATH = '哔哩哔哩XML弹幕输出.csv'

# Header row of the generated CSV; one column per value written for a danmaku.
CSV_HEADER = ["弹幕内容","秒数","弹幕模式","字号","字体颜色","时间戳","弹幕池","发送者ID","弹幕在弹幕数据库中rowID"]

# Send times are rendered in Beijing time (UTC+8) regardless of the timezone
# of the machine running the script.
BEIJING_TZ = timezone(timedelta(hours=8))

# Number of comma-separated values that must be present in a <d p="..."> attribute.
P_FIELD_COUNT = 8


def format_beijing_time(unix_timestamp):
    """Format a Unix timestamp (seconds) as 'YYYY-MM-DD HH:MM:SS' in UTC+8."""
    moment = datetime.fromtimestamp(unix_timestamp, tz=BEIJING_TZ)
    return moment.strftime("%Y-%m-%d %H:%M:%S")


def danmaku_row(element):
    """Build the CSV row for one <d> element, or return None if it is malformed.

    The ``p`` attribute is a comma-separated list whose fields are, in order:

    1. time at which the danmaku appears, in seconds
    2. danmaku mode: 1..3 scrolling, 4 bottom, 5 top, 6 reverse,
       7 precise positioning, 8 advanced
    3. font size: 12 very small, 16 extra small, 18 small, 25 medium,
       36 large, 45 very large, 64 extra large
    4. font colour, as the decimal value of the HTML colour
    5. Unix timestamp of when the danmaku was sent
    6. danmaku pool: 0 normal, 1 subtitle, 2 special (advanced danmaku only)
    7. sender ID, used by the "block this sender" feature
    8. rowID of the danmaku in the danmaku database, used by the
       "danmaku history" feature

    Any fields after the eighth are ignored. An element with fewer than
    eight fields, or with a non-integer timestamp, yields None.
    """
    fields = element.attrib.get('p', '').split(",")
    if len(fields) < P_FIELD_COUNT:
        return None
    try:
        sent_at = format_beijing_time(int(fields[4]))
    except ValueError:
        return None
    # A <d> element without text is written as an empty cell.
    content = element.text or ''
    return [content, fields[0], fields[1], fields[2], fields[3],
            sent_at, fields[5], fields[6], fields[7]]


def export_danmaku(xml_path=XML_PATH, csv_path=CSV_PATH):
    """Parse the danmaku XML at *xml_path* and write one CSV row per <d> element.

    Malformed elements are skipped instead of aborting the export. Returns the
    number of danmaku rows written (the header row is not counted).
    """
    root = ET.ElementTree(file=xml_path).getroot()
    written = 0
    skipped = 0
    # The context manager closes the file even if writing a row fails.
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(CSV_HEADER)
        for element in root.findall('d'):
            row = danmaku_row(element)
            if row is None:
                skipped += 1
                continue
            csv_writer.writerow(row)
            written += 1
    if skipped:
        print("skipped %d malformed danmaku element(s)" % skipped)
    return written


if __name__ == '__main__':
    export_danmaku()
结果如下:
3.解析XML弹幕文件,调用百度情感API接口进行情感分析,并将结果写入csv文件
代码如下:
import xml.etree.ElementTree as ET
import csv
from aip import AipNlp
import time
# Credentials for the Baidu API; replace the placeholders with your own values.
APP_ID = '这里填入自己注册的百度API口令'
API_KEY = '这里填入自己注册的百度API口令'
SECRET_KEY = '这里填入自己注册的百度API口令'

# Default input/output paths used when this file is run as a script.
XML_PATH = 'Test.xml'
CSV_PATH = 'XML弹幕情感分析.csv'

# Pause after every API call, in seconds.
# NOTE(review): presumably this keeps the script under the API's request-rate
# limit - confirm against the quota of your account.
REQUEST_INTERVAL = 0.3


def sentiment_label(response):
    """Translate a sentimentClassify response into a label.

    Returns "消极" when the first item's ``sentiment`` is 0, "中性" when it is
    1 and "积极" for any other value. Returns None when the response carries
    no ``items`` entry to read a sentiment from.
    NOTE(review): a response without ``items`` is presumably an API error
    payload - confirm against the Baidu documentation.
    """
    items = response.get("items") if isinstance(response, dict) else None
    if not items:
        return None
    code = items[0].get('sentiment')
    if code == 0:
        return "消极"
    if code == 1:
        return "中性"
    return "积极"


def analyze_danmaku(xml_path=XML_PATH, csv_path=CSV_PATH, client=None,
                    interval=REQUEST_INTERVAL):
    """Classify every danmaku in *xml_path* and write (text, label) rows to *csv_path*.

    *client* is any object with a ``sentimentClassify(text)`` method; when it
    is None an ``AipNlp`` client is built from the module-level credentials.
    Danmaku without text are not sent to the API, and danmaku whose response
    cannot be interpreted are reported and skipped instead of aborting the
    run. Returns the number of rows written.
    """
    if client is None:
        client = AipNlp(APP_ID, API_KEY, SECRET_KEY)
    root = ET.ElementTree(file=xml_path).getroot()
    written = 0
    # The context manager closes the file even if an API call raises.
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
        csv_writer = csv.writer(f)
        for element in root.findall('d'):
            text = element.text
            if not text or not text.strip():
                # Nothing to classify; skip without spending an API call.
                continue
            response = client.sentimentClassify(text)
            time.sleep(interval)
            label = sentiment_label(response)
            if label is None:
                print("skipped (no sentiment in response):", text, response)
                continue
            print(response.get("text", text), label)
            csv_writer.writerow([text, label])
            written += 1
    return written


if __name__ == '__main__':
    analyze_danmaku()
运行结果: