html 实现b站弹幕,爬取B站弹幕并生成词云

网上看到的爬取教程接口大都失效了,这次自己整一下,就当学习笔记了

我在寻找弹幕接口的时候花了很长时间,一直想从视频本身的请求里找到弹幕的加载地址……

其实弹幕就在右边

39158eb6836d

1.png

其实很多实现思路还是沿用了原来教程里的做法。

代码如下:

from wordcloud import WordCloud

import matplotlib.pyplot as plt

import requests

import jieba

from pyquery import PyQuery as pq

from urllib.parse import urlencode

import datetime

def get_html(url):
    """Fetch *url* with browser-like headers and return the raw response body.

    Args:
        url: Full request URL.

    Returns:
        The response body as ``bytes`` when the server answers with status
        200; ``None`` for any other status or when the request itself fails.
    """
    # NOTE(review): the Cookie below is a hard-coded session value; it is
    # presumably tied to one account and may expire -- confirm before reuse.
    headers = {
        'Cookie': 'b LIVE_BUVID__ckMd5=7776ad817b9e0091; bp_t_offset_328350021=150073248314016020; _dfcaptcha=29276d4b1897beac8fcc8bb55f8ecdce',
        'Host': 'api.bilibili.com',
        'Origin': 'https://www.bilibili.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Only network/HTTP-layer failures are treated as "no data";
        # programming errors and Ctrl-C are allowed to propagate.
        print("Connet_Error")
        return None

    if response.status_code != 200:
        return None
    # The body is returned as bytes, so no text-encoding setup is needed.
    return response.content

def get_text(html):
    """Yield the text of every element matched by the ``'i d'`` selector.

    Args:
        html: Markup to parse. ``None`` or empty input yields nothing, so a
            failed download (which produces ``None``) can be passed in safely.

    Yields:
        The text content of each matched element.
    """
    if not html:
        # Nothing to parse; end the generator without touching the parser.
        return
    doc = pq(html)
    for item in doc('i d').items():
        yield item.text()

def create_date(datestart=None, dateend=None):
    """Build the list of consecutive days from *datestart* through *dateend*.

    Args:
        datestart: First day as a 'YYYY-MM-DD' string; defaults to
            '2018-01-01'.
        dateend: Last day as a 'YYYY-MM-DD' string; defaults to today's
            local date.

    Returns:
        A list of 'YYYY-MM-DD' strings, one per day, both ends included.
        The start date is always present, even if *dateend* is earlier.

    Raises:
        ValueError: If either string does not match the '%Y-%m-%d' format.
    """
    fmt = '%Y-%m-%d'
    if datestart is None:
        datestart = '2018-01-01'
    if dateend is None:
        dateend = datetime.datetime.now().strftime(fmt)

    # Parse both bounds so they can be compared and incremented as dates.
    current = datetime.datetime.strptime(datestart, fmt)
    last = datetime.datetime.strptime(dateend, fmt)

    date_list = [current.strftime(fmt)]
    while current < last:
        # Advance one day and record it as a string.
        current += datetime.timedelta(days=1)
        date_list.append(current.strftime(fmt))
    return date_list

def save_to_file(content):
    """Append *content* followed by a newline to ``1.txt``.

    Args:
        content: The text to store as one line.

    The file is opened relative to the current working directory and is
    created if it does not exist.
    """
    # The encoding is given explicitly so the result does not depend on the
    # platform default; the ``with`` block closes the file on exit.
    with open('1.txt', 'a', encoding='utf-8') as f:
        f.write(content + '\n')

def wordcloud(all_comments):
    """Segment the comments, drop stop words, and display a word cloud.

    Args:
        all_comments: Iterable of comment strings, one comment per entry.

    Side effects:
        Reads ``stopwords.txt``, appends one segmented line per comment to
        ``outputs.txt``, then renders the full contents of ``outputs.txt``
        in a matplotlib window.
    """
    # Load the stop words once; a set gives constant-time membership tests.
    with open('stopwords.txt', 'r', encoding='utf-8') as f:
        stopwords = {line.strip() for line in f}

    def seg_sentence(sentence):
        """Return the kept tokens of *sentence*, each followed by a space."""
        tokens = jieba.cut(sentence.strip(), cut_all=False)  # precise mode
        return ''.join(
            word + ' '
            for word in tokens
            if word not in stopwords and word != '\t'
        )

    # Append mode: lines already in outputs.txt are kept and will also be
    # part of the rendered cloud.
    with open('outputs.txt', 'a', encoding='utf-8') as f:
        for line in all_comments:
            f.write(seg_sentence(line) + '\n')

    with open('outputs.txt', 'r', encoding='utf-8') as f:
        data = f.read()

    my_wordcloud = WordCloud(
        background_color='white',  # canvas background colour
        max_words=200,  # upper bound on the number of words drawn
        font_path=r'SimHei.ttf',  # font file; needed for Chinese glyphs to render
    ).generate(data)

    plt.figure()
    plt.imshow(my_wordcloud)
    plt.axis('off')
    plt.show()  # display the word cloud

def main():
    """Download the comment history day by day, save it, and show a word cloud.

    For every day from 2018-08-06 onwards (as produced by ``create_date``)
    the history endpoint is queried, each extracted comment is appended to
    ``1.txt`` via ``save_to_file``, and finally the saved lines are passed
    to ``wordcloud``.
    """
    base_url = "https://api.bilibili.com/x/v2/dm/history?"
    date_list = create_date("2018-08-06")  # start date; builds the list of days

    for day in date_list:
        params = urlencode({
            'type': '1',
            'oid': '23347802',
            'date': day,
        })
        url = base_url + params
        print(url)

        html = get_html(url)
        if html is None:
            # get_html yields None for a failed or non-200 request;
            # skip that day instead of feeding None to the parser.
            continue
        for item in get_text(html):
            save_to_file(item)

    # Read back the same relative file that save_to_file appends to, rather
    # than an absolute path that only exists on one machine.
    with open('1.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()
    wordcloud(lines)

if __name__ == "__main__":
    # Run the scraper only when executed as a script, not when imported.
    main()

这里先把弹幕内容存进 txt 文件里,之后再从文件读取来生成词云,这样或许会快一些?

结果如下:

39158eb6836d

Figure_1.png

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值