词云图和词频统计结果有出入怎么办

最新推荐文章于 2024-11-02 16:28:26 发布

只有风

最新推荐文章于 2024-11-02 16:28:26 发布

阅读量1.2k

点赞数 47

文章标签： python

本文链接：https://blog.csdn.net/weixin_73637464/article/details/136556336

版权

# Read the whole HLM source text into memory (UTF-8 encoded).
with open('HLM.txt', encoding='utf-8') as f:
    text = f.read()
import jieba
# 分词并统计词频
def wordFreq(text, topn):
    """Segment *text* with jieba and save the most frequent words to a file.

    Tokens of length 1 and tokens listed in ``停用词库.txt`` (one stop word
    per line, UTF-8) are excluded from the count. The result is written to
    ``HLM_词频.txt`` as ``word<TAB>count`` lines, most frequent first. The
    output file is overwritten on every call.

    Args:
        text: The raw text to analyse.
        topn: Maximum number of words to write. When fewer distinct words
            qualify, all of them are written; a value <= 0 writes nothing.
    """
    words = jieba.lcut(text.strip())  # tokenize the text

    # A set gives O(1) membership tests; the ``with`` block closes the file.
    with open('停用词库.txt', 'r', encoding='utf-8') as stop_file:
        stopwords = {line.strip() for line in stop_file}

    counts = {}
    for word in words:
        # Skip single-character tokens and stop words.
        if len(word) == 1 or word in stopwords:
            continue
        counts[word] = counts.get(word, 0) + 1

    # sorted() is stable, so words with equal counts keep first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    # Slicing (instead of indexing 0..topn-1) avoids an IndexError when the
    # text yields fewer than ``topn`` distinct words.
    with open('HLM_词频.txt', 'w', encoding='utf-8') as out_file:
        for word, count in ranked[:max(topn, 0)]:
            out_file.write("{}\t{}\n".format(word, count))

# NOTE: wordFreq() opens HLM_词频.txt in 'w' mode, so the later
# wordFreq(text,100) call in this script overwrites the file written here.
wordFreq(text, 20)  # extract the 20 most frequent words
import matplotlib.pyplot as plt
import wordcloud
import imageio
wordFreq(text,100)  # rewrite the frequency file with the top 100 words

# Load the "word<TAB>count" lines that wordFreq() just wrote.
with open('HLM_词频.txt','r',encoding='utf-8') as freq_file:
    word_cloud_text = freq_file.read()

# Rebuild the {word: count} mapping. Passing the raw file text to
# WordCloud.generate() would make the library tokenize and count the text
# again; every word appears only once in this file, so the counted
# frequencies would be lost and the cloud would not match the statistics.
word_frequencies = {}
for line in word_cloud_text.split('\n'):
    word, sep, count = line.rpartition('\t')
    if sep and word:  # ignore blank or malformed lines
        word_frequencies[word] = int(count)

bg_pic = imageio.imread('star.jpg')  # image that defines the cloud shape
wc = wordcloud.WordCloud(font_path=r'C:\Windows\Fonts\simhei.ttf',
                            background_color='white',
                            width=1000,
                            max_words=200,
                            mask=bg_pic,  # the mask sets the word cloud shape
                            height=860,
                            margin=2
                            ).generate_from_frequencies(word_frequencies)
wc.to_file('HLMcloud_star.png')  # save the rendered image