现在bilibili这个网站的视频得到了广大观看者的喜爱,因此我在学习了词云的制作后,开始了对bilibili视频弹幕的抓取和进行词云的制作
GitHub:https://github.com/Spider-Man123/Bilibili-screen-word-cloud
一.对bilibili指定视频的弹幕进行抓取
1.提取视频id
得有一个视频的url,例如:https://www.bilibili.com/video/av71744160/spm_id_from=333.334.b_63686965665f7265636f6d6d656e64.21
使用正则提取出来它的id,例如上面的这个url的id就是71744160
m='https://www.bilibili.com/video/av71744160/spm_id_from=333.334.b_63686965665f7265636f6d6d656e64.21'
try:
n = re.findall("av.*\d/", m)[0].replace('av', '').replace("/", '').replace("?","")
except :
n=re.findall("av.*from",m)[0].replace('av','').replace('?from','')
2.提取视频弹幕id,构造弹幕链接
经过对文件的分析和抓包,得出了结论,弹幕网页的id存在于pagelist开头的这个包中
里面是一个json文件
包对应的url是 https://api.bilibili.com/x/player/pagelist?aid=71744160&jsonp=jsonp
编写代码提取出来json文件里面的cid,然后构造urll,这个网页里面就有弹幕的内容
url='https://api.bilibili.com/x/player/pagelist?aid={}&jsonp=jsonp'.format(n)
r=requests .get(url=url,headers=headers)
urll="https://comment.bilibili.com/{}.xml".format(r.json()['data'][0]['cid'])
3.提取弹幕网页里面的弹幕
rr=requests .get(url=urll,headers=headers)
rr.encoding =rr.apparent_encoding
html=etree.HTML(rr.content )
dan=html.xpath("//d/text()")
4.弹幕抓取完整代码
import requests
from lxml import etree
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba
import tkinter as tk
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
def danmu(m):
try:
n = re.findall("av.*\d/", m)[0].replace('av', '').replace("/", '').replace("?","")
except :
n=re.findall("av.*from",m)[0].replace('av','').replace('?from','')
url='https://api.bilibili.com/x/player/pagelist?aid={}&jsonp=jsonp'.format(n)
r=requests .get(url=url,headers=headers)
urll="https://comment.bilibili.com/{}.xml".format(r.json()['data'][0]['cid'])
rr=requests .get(url=urll,headers=headers)
rr.encoding =rr.apparent_encoding
html=etree.HTML(rr.content )
dan=html.xpath("//d/text()")
return dan
if __name__=='__main__':
m='https://www.bilibili.com/video/av71744160/spm_id_from=333.334.b_63686965665f7265636f6d6d656e64.21'
window.destroy()
list1=danmu(m)
print(list1)
二:对抓取到的数据进行文本存储
for i in list1:
with open('b','a',encoding='utf-8')as fp:
fp.write(i)
三.对数据进行词云显示
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba
text = open("b", "rb").read()
wordlist = jieba.cut(text, cut_all=True)
wl = " ".join(wordlist)
wc = WordCloud(background_color="white",
scale=6,
max_words=2000,
font_path="aa.ttf",#需要下载中文字体库
max_font_size=50,
random_state=20,
)
myword = wc.generate(wl)
plt.imshow(myword)
plt.axis("off")
plt.show()
四.编写一个GUI图形界面
import tkinter as tk
window=tk.Tk()
window.title('spider-man.bilibili弹幕')
window.geometry ('400x400')
canvas=tk.Canvas(window,bg='blue',height=135,width=190)
l=tk.Label(window,text='输入网站地址',bg='yellow',font=('Calibri',25),width=20,height=2)
l.pack()
link2=tk.Entry(window,width=60)
link2.pack()
b=tk.Button(window,text='生成弹幕词云',bg='orange',font=('Calibri',25),width=20,height=1,command=kaishi)#command为对应运行的函数
b.pack()
window .mainloop()
五.完整代码
import requests
from lxml import etree
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba
import tkinter as tk
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
def kaishi():
#爬去弹幕的函数
def danmu(m):
try:
n = re.findall("av.*\d/", m)[0].replace('av', '').replace("/", '').replace("?","")
except :
n=re.findall("av.*from",m)[0].replace('av','').replace('?from','')
url='https://api.bilibili.com/x/player/pagelist?aid={}&jsonp=jsonp'.format(n)
r=requests .get(url=url,headers=headers)
urll="https://comment.bilibili.com/{}.xml".format(r.json()['data'][0]['cid'])
rr=requests .get(url=urll,headers=headers)
rr.encoding =rr.apparent_encoding
html=etree.HTML(rr.content )
dan=html.xpath("//d/text()")
return dan
#生成词云的函数
def ciyun():
text = open("b", "rb").read()
wordlist = jieba.cut(text, cut_all=True)
wl = " ".join(wordlist)
wc = WordCloud(background_color="white",
scale=6,
max_words=2000,
font_path="aa.ttf",#需要下载中文字体库
max_font_size=50,
random_state=20,
)
myword = wc.generate(wl)
plt.imshow(myword)
plt.axis("off")
plt.show()
if __name__=='__main__':
m=link2.get()
window.destroy()
list1=danmu(m)
for i in list1:
with open('b','a',encoding='utf-8')as fp:
fp.write(i)
ciyun()
#GUI界面
window=tk.Tk()
window.title('spider-man.bilibili弹幕')
window.geometry ('400x400')
canvas=tk.Canvas(window,bg='blue',height=135,width=190)
l=tk.Label(window,text='输入网站地址',bg='yellow',font=('Calibri',25),width=20,height=2)
l.pack()
link2=tk.Entry(window,width=60)
link2.pack()
b=tk.Button(window,text='生成弹幕词云',bg='orange',font=('Calibri',25),width=20,height=1,command=kaishi)
b.pack()
window .mainloop()
六.运行结果