数据来源
首先,咱们把A站视频的弹幕爬取下来,爬取方法在上一篇文章中: Python爬取a站视频弹幕
分词
要绘制词云图,首先咱们得对弹幕进行分词统计,这里用到了jieba库,jieba的详细介绍可以看https://github.com/fxsjy/jieba
咱们这里用jieba.cut()
来切分,接着咱们需要对切分完的单词进行筛选,因为并不是所有单词都是有意义的,所以咱们在这里去除了一些停用词以及只有一个字的词。直接上代码吧:
def word_cut(wordsPath, stopwordsPath):
    """Read danmaku text from a CSV file, segment it with jieba, and
    return the words left after removing stopwords and single characters.

    Args:
        wordsPath: path to the danmaku CSV file; the text is assumed to be
            in column index 2, and the first row is treated as a header.
        stopwordsPath: path to a UTF-8 stopword file, one word per line.

    Returns:
        list[str]: segmented words, excluding stopwords and words whose
        stripped length is <= 1.
    """
    # Read the danmaku column from the CSV file.
    danmu = []
    with open(wordsPath, 'r', encoding='utf-8') as csvfile:
        for line in csv.reader(csvfile):
            danmu.append(line[2])
    text = ",".join(danmu[1:])  # danmu[0] is the header row — skip it

    # Segment with jieba in precise (non-full) mode.
    seg_list = jieba.cut(text, cut_all=False)

    # Load the stopword list into a set for O(1) membership tests;
    # `with` guarantees the file handle is closed (the original leaked it).
    with open(stopwordsPath, encoding='utf-8') as f:
        stopwords = {line.strip() for line in f}

    # BUG FIX: the original condition `not(word.split()) in stopwords`
    # parses as `not (word.split() in stopwords)`; `word.split()` is a
    # *list*, which is never an element of the stopword list, so the test
    # was always True and no stopword was ever removed. The intended test
    # is `word not in stopwords`.
    wordList = []
    for word in seg_list:
        word = word.strip()
        if word not in stopwords and len(word) > 1:
            wordList.append(word)
    return wordList
绘制词云图
绘制词云图用到了wordcloud库,同样的,详细介绍可以看http://amueller.github.io/word_cloud/,
这里咱们简单应用就行,直接上代码吧:
def wordcloud_image(wordList, savePath):
    """Render *wordList* as a word-cloud image and write it to *savePath*."""
    # Configure the renderer: canvas size, CJK-capable font, white background.
    cloud = wordcloud.WordCloud(
        width=1000,
        height=700,
        font_path="msyh.ttc",
        background_color="white",
    )
    # The generator expects one big space-separated string of words.
    cloud.generate(" ".join(wordList))
    # Persist the rendered image to disk.
    cloud.to_file(savePath)
获取弹幕路径
另外,咱们这里是需要进行批处理的,所以要自动获取文件的路径及文件名,这样咱们就可以一次性生成多张词云图
def getCSVPaths(dirPath):
    """Recursively walk *dirPath* and collect every CSV file found.

    Args:
        dirPath: root directory to search.

    Returns:
        list of (folder, filename) tuples, one per file whose name
        ends with '.csv'.
    """
    return [
        (folder, name)
        for folder, _, names in os.walk(dirPath)
        for name in names
        if name.endswith('.csv')
    ]
main方法
if __name__ == '__main__':
    # Path of the stopword list used to filter the segmented words.
    stopwordsPath = 'G:\\桌面文件\\爬取a站弹幕\\stopwords.txt'
    # Directory where the generated word-cloud images are written.
    wordcloud_image_dir = 'G:\\桌面文件\\爬取a站弹幕\\wordcloud_image'

    # Batch step: one word-cloud image per crawled danmaku CSV file.
    for folderName, filename in getCSVPaths('G:\\桌面文件\\爬取a站弹幕'):
        wordsPath = os.path.join(folderName, filename)
        stem = filename.split(".")[0]  # drop the '.csv' extension
        savePath = os.path.join(wordcloud_image_dir, stem + ".jpg")
        print("正在生成{}的词云图中".format(stem))
        wordcloud_image(word_cut(wordsPath, stopwordsPath), savePath)
运行结果
完整代码
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 28 15:29:59 2021
@author: feng
"""
import os
import csv
import jieba
import wordcloud
def word_cut(wordsPath, stopwordsPath):
    """Read danmaku text from a CSV file, segment it with jieba, and
    return the words left after removing stopwords and single characters.

    Args:
        wordsPath: path to the danmaku CSV file; the text is assumed to be
            in column index 2, and the first row is treated as a header.
        stopwordsPath: path to a UTF-8 stopword file, one word per line.

    Returns:
        list[str]: segmented words, excluding stopwords and words whose
        stripped length is <= 1.
    """
    # Read the danmaku column from the CSV file.
    danmu = []
    with open(wordsPath, 'r', encoding='utf-8') as csvfile:
        for line in csv.reader(csvfile):
            danmu.append(line[2])
    text = ",".join(danmu[1:])  # danmu[0] is the header row — skip it

    # Segment with jieba in precise (non-full) mode.
    seg_list = jieba.cut(text, cut_all=False)

    # Load the stopword list into a set for O(1) membership tests;
    # `with` guarantees the file handle is closed (the original leaked it).
    with open(stopwordsPath, encoding='utf-8') as f:
        stopwords = {line.strip() for line in f}

    # BUG FIX: the original condition `not(word.split()) in stopwords`
    # parses as `not (word.split() in stopwords)`; `word.split()` is a
    # *list*, which is never an element of the stopword list, so the test
    # was always True and no stopword was ever removed. The intended test
    # is `word not in stopwords`.
    wordList = []
    for word in seg_list:
        word = word.strip()
        if word not in stopwords and len(word) > 1:
            wordList.append(word)
    return wordList
def wordcloud_image(wordList, savePath):
    """Render *wordList* as a word-cloud image and write it to *savePath*."""
    # Configure the renderer: canvas size, CJK-capable font, white background.
    cloud = wordcloud.WordCloud(
        width=1000,
        height=700,
        font_path="msyh.ttc",
        background_color="white",
    )
    # The generator expects one big space-separated string of words.
    cloud.generate(" ".join(wordList))
    # Persist the rendered image to disk.
    cloud.to_file(savePath)
def getCSVPaths(dirPath):
    """Recursively walk *dirPath* and collect every CSV file found.

    Args:
        dirPath: root directory to search.

    Returns:
        list of (folder, filename) tuples, one per file whose name
        ends with '.csv'.
    """
    return [
        (folder, name)
        for folder, _, names in os.walk(dirPath)
        for name in names
        if name.endswith('.csv')
    ]
if __name__ == '__main__':
    # Path of the stopword list used to filter the segmented words.
    stopwordsPath = 'G:\\桌面文件\\爬取a站弹幕\\stopwords.txt'
    # Directory where the generated word-cloud images are written.
    wordcloud_image_dir = 'G:\\桌面文件\\爬取a站弹幕\\wordcloud_image'

    # Batch step: one word-cloud image per crawled danmaku CSV file.
    for folderName, filename in getCSVPaths('G:\\桌面文件\\爬取a站弹幕'):
        wordsPath = os.path.join(folderName, filename)
        stem = filename.split(".")[0]  # drop the '.csv' extension
        savePath = os.path.join(wordcloud_image_dir, stem + ".jpg")
        print("正在生成{}的词云图中".format(stem))
        wordcloud_image(word_cut(wordsPath, stopwordsPath), savePath)