import jieba
from wordcloud import WordCloud
import numpy as np
from PIL import Image
from matplotlib import colors
# Read the full review text and segment it into words.
# NOTE(review): the path mixes a 女士 (ladies) directory with a 男性 (men's)
# filename — presumably intentional, but confirm the path is correct.
# Use a context manager so the file is closed even if read() raises.
with open(r'女士/男性护肤品前20评论.txt', "r", encoding="utf-8") as f:
    text = f.read()  # entire file contents as one string
words_list_jieba = jieba.lcut(text)  # jieba segmentation -> list of word tokens
def findifhave(demo, stop):
    """Return 'T' when *demo* equals any element of *stop*; otherwise return
    None (implicitly), matching the original sentinel-based contract."""
    if any(demo == candidate for candidate in stop):
        return 'T'
# Build the stop-word list: seed with whitespace tokens, then extend from file.
# To filter extra words, either append them here (e.g. stop = ['\n', '我'])
# or add one word per line to "stop words.txt".
stop = ['\n','\t',' ']
with open("stop words.txt", 'r', encoding='utf-8') as f1:
    for line in f1:
        stop.append(line.replace("\n", ""))  # drop the trailing newline
# (redundant f1.close() removed — the with-statement already closed the file)
# Count how often each segmented word occurs.
# NOTE(review): `dict` / `dict1` shadow builtins and PEP 8 naming; kept as-is
# in case later (unseen) code references these names — TODO rename eventually.
dict = {}
for key in words_list_jieba:
    dict[key] = dict.get(key, 0) + 1
# Remove stop words. A set gives O(1) membership tests instead of the former
# linear scan per word; iterate over a snapshot of the keys so deleting
# entries during the loop is safe.
stop_set = set(stop)
for demo in list(dict.keys()):
    if demo in stop_set:
        del dict[demo]
# Sort (word, count) pairs by count, highest first.
dict1 = sorted(dict.items(), key=lambda d: d[1], reverse=True)
print(dict1)
# python词频 (word-frequency demo)
# 于 2023-04-13 11:37:05 首次发布 (first published 2023-04-13 11:37:05)