词云 wordcloud
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt #绘制图像的模块
import jieba.analyse as anls # 关键词提取
import re
from collections import Counter
import imageio
'''功能描述:
1、读取文本
2、分词
3、加载停用词表
4、去停用词
5、提取关键词2种方法
6、画词云展示
'''
# 1. Read the article text. The context manager closes the file handle as
#    soon as the content has been read.
with open("text.txt", 'r', encoding='utf-8') as text_file:
    text = text_file.read()

# Load the stop-word table: one entry per line, surrounding whitespace stripped.
with open('stopWords_CH.txt', encoding='UTF-8') as stopword_file:
    stopwords = [line.strip() for line in stopword_file]
# Set copy of the table so each membership test below is O(1) instead of a
# linear scan over the list.
stopword_set = set(stopwords)

# Segment the text (stop words still included). jieba.cut returns a lazy
# generator, so it can only be iterated once.
text_split = jieba.cut(text)

# Load the background image used as the word-cloud mask.
mask = imageio.imread(r'background.png')

# Segmentation result with the stop words removed (a list).
text_split_no = [word for word in text_split if word not in stopword_set]

# Join the remaining tokens into one space-separated string.
text_split_no_str = ' '.join(text_split_no)
# Extract keywords with TF-IDF.
print("基于TF-IDF提取关键词结果:")
# Only the words are needed, so the weights are not requested: without
# withWeight, extract_tags returns the top-20 keywords as a plain list of
# strings, highest ranked first.
keywords = anls.extract_tags(text_split_no_str, topK=20)
# Collapse the keyword list into one space-separated string.
keywords = ' '.join(keywords)
print(keywords)
# Draw the word cloud.
wordcloud = WordCloud(
    background_color="white",
    # Font file used for rendering; per the original note, the default font
    # renders the text garbled.
    font_path="C:/Windows/Fonts/simfang.ttf",
    scale=20,  # scaling factor
    max_words=200,
    max_font_size=80,
    mask=mask,  # background image; can be dropped if no mask is wanted
    contour_width=3,  # width of the contour line
    contour_color='steelblue',  # colour of the contour line
)
# Lay out the cloud from the space-separated keyword string.
wordcloud.generate(keywords)

# Save the image under a name made of the first five characters of the
# keyword string.
fileName = keywords[:5] + '.png'
wordcloud.to_file(fileName)

# Show the result in a matplotlib window with the axes hidden.
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
使用方法:
运行前,在 .py 脚本所在目录下准备以下文件:
- text.txt:待分析的文章内容(UTF-8 编码)
- stopWords_CH.txt:停用词表(UTF-8 编码),每行一个停用词
- background.png:词云的背景轮廓图