Python-Based Text Segmentation and Word Cloud Generation: A Case Study on QQ Group Chat Logs
Import the word cloud library wordcloud, the Chinese segmentation library jieba, the re module, and other dependencies
import wordcloud
import jieba
import re
import imageio
Jupyter magic command (reloads the lab_black auto-formatter extension)
%reload_ext lab_black
Read a local image with imread to serve as the word cloud's shape mask
mk = imageio.imread("alice.png")
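As a quick sanity check (a sketch; the actual shape depends on alice.png), confirm the mask loaded as a NumPy array. The wordcloud library treats pure-white pixels as masked out, so words are drawn only inside the non-white silhouette.
print(mk.shape, mk.dtype)  # e.g. (600, 800, 3) uint8; values here are illustrative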
Build the word cloud object w, setting the image width, height, font, background color, and other parameters
w = wordcloud.WordCloud(width=1000,
                        height=700,
                        background_color="white",
                        font_path="msyh.ttc",
                        scale=3,
                        mask=mk,
                        stopwords={"表情", "图片", "撤回", "一条", "消息", "一个"},
                        contour_width=1,
                        contour_color="steelblue")
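Note that the stopwords set passed above is applied automatically when generate() is called. The library also bundles an English-only stop word set, wordcloud.STOPWORDS, which does not help with Chinese text; that is why a custom set is used here and the HIT list is loaded below.
print(len(wordcloud.STOPWORDS))  # size of the bundled English stop word set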
Load the QQ group chat log file
f = open("网工192的渣男渣女们②.txt", encoding="utf-8")
txt_content = f.readlines()
Initial data cleaning
txt_content = txt_content[8:]  # skip the export header (message group, message object, etc.)
txtlist = []
for line in txt_content:
    # strip whitespace, then use a regex to remove the date/time/username
    # header lines (a raw string so \d is not treated as an escape sequence)
    lines = re.sub(r"\d{4}-\d{2}-\d{2} \d{1,2}:\d{2}:\d{2} .*", "",
                   line.strip())
    txtlist.append(lines)
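To illustrate what the regex strips out, here is a hypothetical header line in the usual QQ export format (the nickname and QQ number are made up):
sample = "2020-03-15 12:34:56 某同学(123456789)"
print(re.sub(r"\d{4}-\d{2}-\d{2} \d{1,2}:\d{2}:\d{2} .*", "", sample))  # prints an empty string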
Load the HIT (Harbin Institute of Technology) Chinese stop word list
f = open("chinese_stop_words.txt", encoding="utf-8")
stop_words = {line.strip() for line in f}  # one stop word per line, loaded as a set
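A quick check that the list loaded correctly (exact contents depend on the stop word file):
print(len(stop_words))     # number of stop words loaded
print("的" in stop_words)  # common particles such as 的 appear in most Chinese stop word lists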
Chinese word segmentation and deeper data cleaning
words = jieba.lcut("".join(txtlist))  # segment the cleaned chat text
string = " ".join(word for word in words
                  if word not in stop_words)  # drop Chinese stop words
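For reference, jieba.lcut returns a list of tokens; a minimal example (output from jieba's default precise mode):
jieba.lcut("我爱自然语言处理")  # -> ['我', '爱', '自然语言', '处理']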
Feed the text into the word cloud object
w.generate(string)
Export the word cloud image
w.to_file("output.png")
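Besides saving to a file, the result can be previewed inline; a minimal sketch, assuming matplotlib is installed:
import matplotlib.pyplot as plt

plt.imshow(w, interpolation="bilinear")  # WordCloud objects convert to image arrays
plt.axis("off")
plt.show()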