# 最近,2023年的中央一号文件发布,对于这个文本,我们怎么分析呢?词频统计分析不失为一种分析方法,下面是具体的操作。
# main()
def main(filename):
    """Run the full analysis on *filename*: word frequencies, then a word cloud.

    Prompts the user for how many of the most common words to print.
    """
    # int() instead of eval(): eval on raw user input executes arbitrary code.
    n = int(input('please enter a number you want see:'))
    words = get_word(filename)
    word_count(words, n)
    word_cloud(words)
def get_word(filename):
    """Read *filename* (UTF-8) and return its text segmented into a word list.

    Uses jieba for Chinese word segmentation.
    """
    import jieba  # local import, matching this file's in-function import style
    # 'with' guarantees the file handle is closed even if reading raises.
    with open(filename, 'r', encoding='utf-8') as f:
        txt = f.read()
    return jieba.lcut(txt)
def word_count(words, n):
    """Print and return the *n* most common words in *words*.

    Punctuation marks and a few stopwords are removed before ranking.
    Returns the list of (word, count) pairs for the top *n* entries
    (the original version returned None; callers that ignore the return
    value are unaffected).
    """
    from collections import Counter  # Counter keeps the counting code short
    counts = Counter(words)
    # Drop punctuation and common stopwords that would dominate the ranking.
    # pop(ch, None) instead of `del counts[ch]`: del raises KeyError when a
    # character never appeared in the text at all.
    for ch in "、。,!“()和的“等”":
        counts.pop(ch, None)
    top = counts.most_common(n)
    for word, count in top:
        print("{0:<15}{1:>5}".format(word, count))
    return top
def word_cloud(words):
    """Build a word cloud from *words* and return the WordCloud object.

    Single-character tokens (mostly punctuation and function words) are
    dropped before rendering. The original version built the cloud and
    discarded it; returning it lets the caller save or display it, e.g.
    ``wc.to_file('cloud.png')`` or ``wc.to_image().show()``.
    """
    from wordcloud import WordCloud
    # Keep only multi-character tokens; WordCloud expects space-separated text.
    text = ' '.join(word for word in words if len(word) != 1)
    wc = WordCloud(
        background_color='white',
        width=800,
        height=800,
        max_words=200,
        max_font_size=100,
        font_path='msyh.ttc',  # a CJK-capable font is required for Chinese text
    ).generate(text)
    return wc