# Purpose: compute word frequencies of a text and render them as a word cloud.
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud,ImageColorGenerator
from imageio import imread
# 引入相应的库,jieba是中文分词库,pyplot是作图,wordcloud是做词云,imageio是图像分析
# 首先打开要统计的文本
# Read the text to analyze.
with open('test.txt', 'r', encoding='UTF-8') as novelFile:
    novel = novelFile.read()

# Load the stop words (punctuation/words treated as separators), one per line.
# Use 'with' so the file is closed (the original leaked the handle), and a set
# for O(1) membership tests inside the counting loop.
with open('stop.txt', 'r', encoding='UTF-8') as stopFile:
    stopwords = {line.strip() for line in stopFile}

# Precise-mode segmentation with jieba; lcut already returns a list,
# so the extra list() wrapper of the original is unnecessary.
novelList = jieba.lcut(novel)

# Count word frequencies, skipping stop words and single-character tokens.
novelDict = {}
for word in novelList:
    if word in stopwords or len(word) == 1:
        continue
    novelDict[word] = novelDict.get(word, 0) + 1

# (word, count) pairs sorted by descending frequency.
novelListSorted = sorted(novelDict.items(), key=lambda item: item[1], reverse=True)

# Print the 10 most frequent (word, count) pairs.
for pair in novelListSorted[:10]:
    print(pair)

# Line chart of the top-10 frequencies.
x = [word for word, _ in novelListSorted[:10]]
y = [count for _, count in novelListSorted[:10]]
plt.plot(x, y, color='r')
plt.show()

# Mask image that shapes the word cloud.
bg_pic = imread('图1.png')
# Build the cloud from the actual frequency counts. The original joined only
# novelDict.keys(), discarding the counts this script exists to compute;
# generate_from_frequencies() sizes each word by its frequency.
# font_path='msyh.ttc' is required so CJK glyphs render instead of boxes.
wordcloud = WordCloud(mask=bg_pic, background_color='white',
                      scale=1.5,
                      font_path='msyh.ttc').generate_from_frequencies(novelDict)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.to_file('父亲.jpg')