1.这里先爬取某视频历史弹幕,具体方法可见之前文章
# The request headers are the same for every day, so build them once
# outside the loop instead of on each iteration.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
    'Cookie': '浏览器中cookie'  # placeholder: replace with the Cookie value copied from the browser
}
for i in range(10, 16):  # fetch the danmaku history for 2024-01-10 .. 2024-01-15
    url = f'https://api.bilibili.com/x/v2/dm/web/history/seg.so?type=1&oid=1399395673&date=2024-01-{i}'
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.encoding = 'utf-8'  # decode the body as UTF-8 to avoid mojibake
    content_list = re.findall('[\u4e00-\u9fa5]+', response.text)  # keep only runs of Chinese characters
    content = ' '.join(content_list)
    with open('历史弹幕.txt', mode='a', encoding='utf_8') as f:
        # End each day's text with a separator; without it the last run of
        # one day and the first run of the next are glued together in the
        # file and would later be segmented as one piece of text.
        f.write(content + '\n')
2. 使用 jieba 分词,用 for 循环统计词频,并去除只有单个字的词。(这里我没有用到停用词)
# Read the scraped text; the with-block guarantees the file is closed
# as soon as its contents have been read.
with open('历史弹幕.txt', encoding='utf-8') as f:
    text = f.read()
words = lcut(text)  # jieba word segmentation
for word in words:
    if len(word) == 1:  # skip single-character tokens
        continue
    word_dict[word] = word_dict.get(word, 0) + 1
# Most frequent words first.
sort_list = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)
for word, count in sort_list:
    # Left-align both columns, padding each to a minimum width of 10.
    print('{:<10}{:<10}'.format(word, count))
3.完整代码
import requests
import re
from jieba import *
# Module-level table mapping each word to its occurrence count;
# word_count() fills it in.
word_dict = {}
def get_content():
    """Scrape the danmaku history and append its Chinese text to a file.

    Requests the history endpoint once per day for 2024-01-10 through
    2024-01-15, extracts every run of Chinese characters from the
    response body, and appends the space-joined runs to '历史弹幕.txt'.

    The file is opened in append mode, so text from earlier runs of the
    script is kept and new text is added after it.

    Raises:
        requests.exceptions.RequestException: if a request fails or does
            not complete within the timeout.
    """
    # The headers do not change between days; build them once.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
        'Cookie': '浏览器中cookie'  # placeholder: replace with the Cookie value copied from the browser
    }
    for i in range(10, 16):  # days 10 through 15
        url = f'https://api.bilibili.com/x/v2/dm/web/history/seg.so?type=1&oid=1399395673&date=2024-01-{i}'
        # A timeout keeps the script from hanging forever on a stalled connection.
        response = requests.get(url=url, headers=headers, timeout=10)
        response.encoding = 'utf-8'  # decode the body as UTF-8 to avoid mojibake
        content_list = re.findall('[\u4e00-\u9fa5]+', response.text)  # keep only runs of Chinese characters
        content = ' '.join(content_list)
        with open('历史弹幕.txt', mode='a', encoding='utf_8') as f:
            # End each day's text with a separator; without it the last run
            # of one day and the first run of the next are glued together
            # in the file and would later be segmented as one piece of text.
            f.write(content + '\n')
def word_count():
    """Count word frequencies in the scraped text and print them.

    Reads '历史弹幕.txt', segments it with jieba's ``lcut``, tallies every
    token longer than one character into the module-level ``word_dict``,
    and prints the words with their counts, most frequent first.

    Counts accumulate in ``word_dict`` and are not reset here, so calling
    this function more than once adds to the existing totals.
    """
    # The with-block closes the file as soon as its contents are read.
    with open('历史弹幕.txt', encoding='utf-8') as f:
        text = f.read()
    words = lcut(text)
    for word in words:
        if len(word) == 1:  # skip single-character tokens
            continue
        word_dict[word] = word_dict.get(word, 0) + 1
    # Most frequent words first.
    sort_list = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)
    for word, count in sort_list:
        # Left-align both columns, padding each to a minimum width of 10.
        print('{:<10}{:<10}'.format(word, count))
# Run the two steps in order: get_content() appends the scraped text to
# 历史弹幕.txt, then word_count() reads that file and prints the frequencies.
get_content()
word_count()
4.输出结果