# Purpose: compute word frequencies of a text and render them as a word cloud.
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud,ImageColorGenerator
from imageio import imread
# 引入相应的库,jieba是中文分词库,pyplot是作图,wordcloud是做词云,imageio是图像分析
# 首先打开要统计的文本
# Read the text to analyze.
with open('test.txt', 'r', encoding='UTF-8') as novelFile:
    novel = novelFile.read()

# Load the stop words (punctuation/words treated as separators), one per line.
# Use 'with' so the file is closed (the original leaked the handle), and a set
# for O(1) membership tests inside the counting loop.
with open('stop.txt', 'r', encoding='UTF-8') as stopFile:
    stopwords = {line.strip() for line in stopFile}

# Precise-mode segmentation with jieba; lcut already returns a list,
# so the extra list() wrapper of the original is unnecessary.
novelList = jieba.lcut(novel)

# Count word frequencies, skipping stop words and single-character tokens.
novelDict = {}
for word in novelList:
    if word in stopwords or len(word) == 1:
        continue
    novelDict[word] = novelDict.get(word, 0) + 1

# (word, count) pairs sorted by descending frequency.
novelListSorted = sorted(novelDict.items(), key=lambda item: item[1], reverse=True)

# Print the 10 most frequent (word, count) pairs.
for pair in novelListSorted[:10]:
    print(pair)

# Line chart of the top-10 frequencies.
x = [word for word, _ in novelListSorted[:10]]
y = [count for _, count in novelListSorted[:10]]
plt.plot(x, y, color='r')
plt.show()

# Mask image that shapes the word cloud.
bg_pic = imread('图1.png')
# Build the cloud from the actual frequency counts. The original joined only
# novelDict.keys(), discarding the counts this script exists to compute;
# generate_from_frequencies() sizes each word by its frequency.
# font_path='msyh.ttc' is required so CJK glyphs render instead of boxes.
wordcloud = WordCloud(mask=bg_pic, background_color='white',
                      scale=1.5,
                      font_path='msyh.ttc').generate_from_frequencies(novelDict)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.to_file('父亲.jpg')