html 实现b站弹幕,爬取B站弹幕并生成词云

最新推荐文章于 2024-04-08 07:39:13 发布

防晒霜白癜风患者

最新推荐文章于 2024-04-08 07:39:13 发布

阅读量327

点赞数

文章标签： html 实现b站弹幕

网上看到的爬取教程接口大都失效了，这次自己整一下，就当学习笔记了

自己在寻找弹幕的时候耗了很长时间，老想在视频上找到弹幕的加载地址……

其实弹幕就在右边

39158eb6836d

1.png

其实很多实现还是沿用了原来的做法。

代码如下：

from wordcloud import WordCloud

import matplotlib.pyplot as plt

import requests

import jieba

from pyquery import PyQuery as pq

from urllib.parse import urlencode

import datetime

def get_html(url, timeout=10):
    """Download *url* and return the raw response body.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as ``bytes`` when the server answers with
        status 200; ``None`` for any other status code or when the
        request itself fails.
    """
    # NOTE: the Host header is fixed to api.bilibili.com, so these headers
    # are only appropriate for URLs on that host.
    headers = {
        'Cookie': 'b LIVE_BUVID__ckMd5=7776ad817b9e0091; bp_t_offset_328350021=150073248314016020; _dfcaptcha=29276d4b1897beac8fcc8bb55f8ecdce',
        'Host': 'api.bilibili.com',
        'Origin': 'https://www.bilibili.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Only network-level failures are reported here; programming
        # errors are no longer hidden by a bare ``except``.
        print("Connet_Error")
        return None
    if response.status_code != 200:
        return None
    # The raw bytes are returned, so setting ``response.encoding`` (which
    # only influences ``response.text``) is not needed.
    return response.content

def get_text(html):
    """Yield the text of every ``d`` element nested inside an ``i`` element.

    Args:
        html: Markup to parse (``str`` or ``bytes``). ``None`` or an empty
            value yields nothing, so the result of a failed download can be
            passed in safely.

    Yields:
        The text content of each matched element, in document order.
    """
    if not html:
        # Nothing was downloaded; there is nothing to parse.
        return
    doc = pq(html)
    # 'i d' is a descendant selector: <d> elements under an <i> element.
    # Presumably each <d> holds one comment -- confirm against the API output.
    for item in doc('i d').items():
        yield item.text()

def create_date(datestart=None, dateend=None):
    """Build the list of consecutive days from *datestart* to *dateend*.

    Args:
        datestart: First day as a ``'YYYY-MM-DD'`` string. Defaults to
            ``'2018-01-01'``.
        dateend: Last day as a ``'YYYY-MM-DD'`` string. Defaults to today.

    Returns:
        A list of ``'YYYY-MM-DD'`` strings, one per day, both ends included.
        The start day is always present, even when it lies after *dateend*.

    Raises:
        ValueError: If either argument does not match ``'%Y-%m-%d'``.
    """
    date_format = '%Y-%m-%d'
    if datestart is None:
        datestart = '2018-01-01'
    if dateend is None:
        dateend = datetime.datetime.now().strftime(date_format)

    # Parse the strings so the days can be compared and incremented.
    current = datetime.datetime.strptime(datestart, date_format)
    last = datetime.datetime.strptime(dateend, date_format)

    date_list = [current.strftime(date_format)]
    while current < last:
        # Advance one day and record it.
        current += datetime.timedelta(days=1)
        date_list.append(current.strftime(date_format))
    return date_list

def save_to_file(content, path='1.txt'):
    """Append *content* followed by a newline to a UTF-8 text file.

    Args:
        content: Text to write as one line.
        path: File to append to; created if it does not exist. Defaults to
            ``'1.txt'`` in the current working directory.
    """
    # An explicit encoding keeps non-ASCII text intact regardless of the
    # platform default. The ``with`` block closes the file on exit.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(content + '\n')

def wordcloud(all_comments):
    """Segment the comments, append them to ``outputs.txt`` and show a cloud.

    Each comment is split into words with jieba (precise mode), words listed
    in ``stopwords.txt`` are dropped, and the remaining words are appended to
    ``outputs.txt`` as one space-separated line per comment. The complete
    contents of ``outputs.txt`` are then rendered as a word cloud and
    displayed with matplotlib.

    Args:
        all_comments: Iterable of comment strings.
    """
    # Load the stop words once instead of once per sentence; a set makes the
    # membership test cheap. Files are opened with an explicit UTF-8 encoding.
    with open('stopwords.txt', 'r', encoding='utf-8') as f:
        stopwords = {line.strip() for line in f}

    def seg_sentence(sentence):
        """Return the kept words of *sentence*, each followed by a space."""
        words = jieba.cut(sentence.strip(), cut_all=False)  # precise mode
        return ''.join(
            word + ' '
            for word in words
            if word not in stopwords and word != '\t'
        )

    # NOTE: the file is opened in append mode, so segmented text from earlier
    # runs stays in outputs.txt and also ends up in the cloud.
    with open('outputs.txt', 'a', encoding='utf-8') as f:
        for line in all_comments:
            f.write(seg_sentence(line) + '\n')

    with open('outputs.txt', 'r', encoding='utf-8') as f:
        data = f.read()

    my_wordcloud = WordCloud(
        background_color='white',  # background colour of the image
        max_words=200,  # upper limit on the number of words drawn
        font_path=r'SimHei.ttf',  # font used for rendering; needed for Chinese text
    ).generate(data)

    plt.figure()
    plt.imshow(my_wordcloud)
    plt.axis('off')
    plt.show()  # display the word cloud

def main(oid='23347802', start_date="2018-08-06"):
    """Download the comment history day by day and show it as a word cloud.

    For every day from *start_date* until today the history endpoint is
    requested, each extracted comment is appended to ``1.txt`` through
    ``save_to_file``, and finally the collected lines are passed to
    ``wordcloud``.

    Args:
        oid: Value sent as the ``oid`` query parameter.
        start_date: First day to request, as ``'YYYY-MM-DD'``.
    """
    base_url = "https://api.bilibili.com/x/v2/dm/history?"
    for day in create_date(start_date):
        query = urlencode({
            'type': '1',
            'oid': oid,
            'date': day,
        })
        url = base_url + query
        print(url)
        html = get_html(url)
        if html is None:
            # The download failed or returned a non-200 status; skip the day.
            continue
        for item in get_text(html):
            save_to_file(item)

    # Read back the same relative file that save_to_file() appends to,
    # rather than a machine-specific absolute path.
    with open('1.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()
    wordcloud(lines)

# Run the scraper only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()

先把弹幕内容存进 txt 文件里，之后再读取，这样会不会快一些？

结果如下：

39158eb6836d

Figure_1.png

防晒霜白癜风患者

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫