网上看到的爬取教程接口大都失效了,这次自己整一下,就当学习笔记了
自己在寻找弹幕的时候耗了很长时间,老想在视频上找到弹幕的加载地址……
其实弹幕就在右边
1.png
其实代码里的好多实现还是沿用了原来教程里的写法
代码如下:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import requests
import jieba
from pyquery import PyQuery as pq
from urllib.parse import urlencode
import datetime
def get_html(url, timeout=10):
    """Fetch ``url`` with browser-like headers and return the raw body.

    Args:
        url: Full request URL, including the query string.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The response body as ``bytes`` when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or a failed request).
    """
    headers = {
        'Cookie': 'b LIVE_BUVID__ckMd5=7776ad817b9e0091; bp_t_offset_328350021=150073248314016020; _dfcaptcha=29276d4b1897beac8fcc8bb55f8ecdce',
        'Host': 'api.bilibili.com',
        'Origin': 'https://www.bilibili.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Only request/network failures are treated as "no data"; programming
        # errors are no longer hidden by a bare ``except``.
        print("Connet_Error")
        return None
    if response.status_code != 200:
        return None
    # ``content`` is the undecoded byte payload, so there is no need to run
    # encoding detection on the response before returning it.
    return response.content
def get_text(html):
    """Yield the text of every ``d`` element nested inside an ``i`` element.

    Args:
        html: Markup to parse, passed to PyQuery unchanged.

    Yields:
        The text content of each matched element, in document order.
    """
    document = pq(html)
    yield from (node.text() for node in document('i d').items())
def create_date(datestart=None, dateend=None):
    """Build the list of consecutive days from ``datestart`` to ``dateend``.

    Args:
        datestart: First day as a ``'%Y-%m-%d'`` string. Defaults to
            ``'2018-01-01'``.
        dateend: Last day (inclusive) as a ``'%Y-%m-%d'`` string. Defaults to
            the current local date.

    Returns:
        A list of ``'%Y-%m-%d'`` strings, one per day, with both endpoints
        included. If ``dateend`` is earlier than ``datestart`` the list
        contains only ``datestart``.

    Raises:
        ValueError: If either string does not match ``'%Y-%m-%d'``.
    """
    date_format = '%Y-%m-%d'
    if datestart is None:
        datestart = '2018-01-01'
    if dateend is None:
        dateend = datetime.datetime.now().strftime(date_format)

    # Parse both bounds so they can be compared and incremented as dates.
    current = datetime.datetime.strptime(datestart, date_format)
    last = datetime.datetime.strptime(dateend, date_format)

    date_list = [current.strftime(date_format)]
    # The loop must be bounded by the end date; without the comparison it
    # would never terminate.
    while current < last:
        current += datetime.timedelta(days=1)
        date_list.append(current.strftime(date_format))
    return date_list
def save_to_file(content, filename='1.txt'):
    """Append ``content`` followed by a newline to a UTF-8 text file.

    Args:
        content: Text to append as one line.
        filename: Path of the file to append to. Defaults to ``'1.txt'`` in
            the current working directory.
    """
    # The encoding must be given explicitly so the text is written as UTF-8
    # regardless of the platform default. The ``with`` block closes the file,
    # so no explicit ``close()`` is needed.
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(content + '\n')
def wordcloud(all_comments):
    """Segment the comments, drop stop words and display a word cloud.

    Every comment is tokenised with jieba, filtered against the entries of
    ``stopwords.txt`` and appended as one line to ``outputs.txt``. The whole
    content of ``outputs.txt`` is then passed to ``WordCloud`` and the result
    is shown with matplotlib.

    Args:
        all_comments: Iterable of comment strings.
    """
    # Load the stop words once instead of once per comment, and keep them in
    # a set so each membership test is O(1). Files are opened with explicit
    # UTF-8 encoding to avoid decode errors.
    with open('stopwords.txt', 'r', encoding='utf-8') as stop_file:
        stopwords = {line.strip() for line in stop_file}

    def seg_sentence(sentence):
        """Return the kept tokens of ``sentence``, each followed by a space."""
        tokens = jieba.cut(sentence.strip(), cut_all=False)  # accurate mode
        return ''.join(
            token + ' '
            for token in tokens
            if token not in stopwords and token != '\t'
        )

    # NOTE: append mode is kept on purpose, so lines written by earlier runs
    # remain in outputs.txt and also contribute to the cloud.
    with open('outputs.txt', 'a', encoding='utf-8') as out_file:
        for line in all_comments:
            out_file.write(seg_sentence(line) + '\n')

    with open('outputs.txt', 'r', encoding='utf-8') as in_file:
        data = in_file.read()

    my_wordcloud = WordCloud(
        background_color='white',  # background colour
        max_words=200,  # maximum number of words displayed
        font_path=r'SimHei.ttf',  # font file; without it Chinese is not rendered
    ).generate(data)

    plt.figure()
    plt.imshow(my_wordcloud)
    plt.axis('off')
    plt.show()  # display the word cloud
def main():
    """Download the daily danmaku history, save it, and plot a word cloud.

    For every day from 2018-08-06 up to today the history endpoint is
    queried, each extracted comment is appended to ``1.txt`` through
    ``save_to_file``, and finally the saved lines are read back and handed to
    ``wordcloud``.
    """
    base_url = "https://api.bilibili.com/x/v2/dm/history?"
    date_list = create_date("2018-08-06")  # start date; the list runs up to today
    for day in date_list:
        params = {
            'type': '1',
            'oid': '23347802',  # presumably the id of the target video -- confirm
            'date': day
        }
        url = base_url + urlencode(params)
        print(url)
        html = get_html(url)
        if html is None:
            # get_html returns None when the request fails or is not HTTP 200;
            # skip that day instead of handing None to the parser.
            continue
        for item in get_text(html):
            save_to_file(item)

    # Read back the same file that save_to_file appends to, rather than a
    # machine-specific absolute path.
    with open('1.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()
    wordcloud(lines)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
这里先把弹幕内容存进 txt 文件里,之后再读取出来生成词云,这样或许会快一些?
结果如下:
Figure_1.png