中英文词频统计及画词云图

最新推荐文章于 2024-06-28 09:48:33 发布

Python_QB

最新推荐文章于 2024-06-28 09:48:33 发布

阅读量1.2k

点赞数 2

文章标签： python 开发语言

本文链接：https://blog.csdn.net/qq_33267306/article/details/127396463

版权

一、英文的词频统计

import jieba
import numpy as np
import PIL.Image as Image
from wordcloud import WordCloud


from collections import Counter

# Load the stop-word list: one entry per line, surrounding whitespace stripped.
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    st = f.readlines()

stopwords = [i.strip() for i in st]
# strip() can never yield a bare newline, so it is added explicitly.
stopwords.append('\n')

# Load the text to analyse. The encoding is stated explicitly (matching the
# stop-word file) so the result does not depend on the platform default.
with open("the little prince.txt", 'r', encoding='utf-8') as file:
    datas = file.readlines()

# Tokenise: delete newlines, full stops and colons, then split on single
# spaces. Deleting "." already removes "...", so no separate pass is needed.
_strip_table = str.maketrans('', '', '\n.:')
words = []
for line in datas:
    words += line.translate(_strip_table).split(" ")

# Drop stop words in a single pass; the set gives O(1) membership tests
# instead of repeatedly scanning and shrinking the list.
stopword_set = set(stopwords)
words = [word for word in words if word not in stopword_set]

# Count each remaining token once. Tokens of length 0 or 1 are not counted.
dict_counts = Counter(word for word in words if len(word) > 1)

# Sort by count, highest first. The sort is stable, so words with equal
# counts keep the order in which they first appeared in the text.
lists = list(dict_counts.items())
lists.sort(key=lambda x: x[-1], reverse=True)

#输出前n条的数据
def put(n):
    """Print the first ``n`` entries of ``lists`` and return their words.

    ``lists`` is the module-level list of ``(word, count)`` tuples, already
    sorted by count in descending order.

    Args:
        n: Maximum number of entries to output. If fewer entries exist,
            all of them are used; a value of zero or less yields nothing.

    Returns:
        The selected words joined by single spaces (an empty string when
        nothing was selected).
    """
    # Slicing clamps to the available length, so asking for more entries
    # than exist no longer raises IndexError. max() keeps a negative n from
    # being interpreted as "count from the end".
    top = lists[:max(n, 0)]
    for entry in top:
        print(entry)
    return ' '.join(entry[0] for entry in top)
# Space-separated string of the 50 most frequent words; this is the text
# the word cloud is generated from.
res = put(50)

# Load the picture as an array; it is passed to WordCloud as the mask.
pic = np.array(Image.open("pig.jpg"))

wordclo = WordCloud(
    stopwords={'i'},           # words that must not appear in the cloud
    max_words=1000,            # upper bound on the number of words
    max_font_size=180,         # largest font size
    background_color='white',  # background colour
    mask=pic,                  # background picture
    font_path='impact.ttf',    # font file; must exist on this machine
)
wordclo = wordclo.generate(res)

# Show the rendered picture, then save it to disk.
image = wordclo.to_image()
image.show()
wordclo.to_file('result1.png')

其中the little prince.txt是这样的：

stopwords.txt是这样的：

最后统计出是：

输出的图片：

二、中文分词

中文文本的词与词之间没有空格分隔，因此需要先用 jieba.lcut() 进行分词。

其余步骤（去除停用词、统计词频、排序、生成词云）与英文部分基本类似。

import jieba
import numpy as np
import PIL.Image as Image
from wordcloud import WordCloud

from collections import Counter

# Load the stop-word list: one entry per line, surrounding whitespace stripped.
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    st = f.readlines()

stopwords = [i.strip() for i in st]
# strip() can never yield a bare newline, so it is added explicitly.
stopwords.append('\n')

# Read the whole source text from da.txt.
with open('da.txt', 'r', encoding='utf-8') as f:
    w = f.read()

# Segment the text into a list of words with jieba.
words = jieba.lcut(w)

# Drop stop words in a single pass; the set gives O(1) membership tests
# instead of repeatedly scanning and shrinking the list.
stopword_set = set(stopwords)
words = [word for word in words if word not in stopword_set]

# Count each remaining token once. Single-character tokens are not counted.
dict_counts = Counter(word for word in words if len(word) > 1)

# Sort by count, highest first. The sort is stable, so words with equal
# counts keep the order in which they first appeared in the text.
lists = list(dict_counts.items())
lists.sort(key=lambda x: x[-1], reverse=True)

#输出前n条的数据
def put(n):
    """Print the first ``n`` entries of ``lists`` and return their words.

    ``lists`` is the module-level list of ``(word, count)`` tuples, already
    sorted by count in descending order.

    Args:
        n: Maximum number of entries to output. If fewer entries exist,
            all of them are used; a value of zero or less yields nothing.

    Returns:
        The selected words joined by single spaces (an empty string when
        nothing was selected).
    """
    # Slicing clamps to the available length, so asking for more entries
    # than exist no longer raises IndexError. max() keeps a negative n from
    # being interpreted as "count from the end".
    top = lists[:max(n, 0)]
    for entry in top:
        print(entry)
    return ' '.join(entry[0] for entry in top)
# Space-separated string of the 100 most frequent words; this is the text
# the word cloud is generated from.
res = put(100)

# Load the picture as an array; it is passed to WordCloud as the mask.
pic = np.array(Image.open("mo.jpg"))

wordclo = WordCloud(
    stopwords={'呢'},          # words that must not appear in the cloud
    max_words=1000,            # upper bound on the number of words
    max_font_size=180,         # largest font size
    background_color='white',  # background colour
    mask=pic,                  # background picture
    font_path='STXINGKA.TTF',  # font file; must exist on this machine
)
wordclo = wordclo.generate(res)

# Show the rendered picture, then save it to disk.
image = wordclo.to_image()
image.show()
wordclo.to_file('result.png')

打印结果：