# Report the top-K most frequent words in a text, or how often a given word occurs.
import jieba
# Configuration
# Path of the input text file
path = r"D:\code\project\xkp\report\ln2014.txt"
# Whether to enable redundant mode; when enabled, “企业结构” is split into “企业结构、企业、结构”
res = False
# Number of top-frequency words to print, together with their counts
top_k = 10
# Word to look up
find_word = "企业"
# Read the whole input file up front. The context manager guarantees the
# handle is closed even if reading or decoding raises.
with open(path, "r", encoding='utf-8') as src_file:
    txt = src_file.read()
# Segment the text into a list of tokens; `res` is forwarded as jieba's
# cut_all flag (the "redundant mode" switch from the configuration above).
words = jieba.lcut(txt, cut_all=res)
# Tally every segmented token, ignoring single-character tokens.
counts = {}
for word in words:
    if len(word) == 1:
        continue
    counts[word] = counts.get(word, 0) + 1

# Rank by descending frequency. The sort is stable, so words with equal
# counts keep their first-seen order.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

# Slice instead of indexing so a text with fewer than top_k distinct words
# prints what is available rather than raising IndexError. The max() clamp
# keeps a negative top_k printing nothing, as range(top_k) did.
for word, count in items[:max(top_k, 0)]:
    print("{0:<10}{1:>5}".format(word, count))
# Count how many multi-character tokens are exactly the looked-up word.
# Single-character tokens are excluded, so a one-character find_word
# always yields zero.
occ_num = sum(
    1
    for token in words
    if len(token) != 1 and token == find_word
)

# Separator line, then the summary of the look-up.
print("---" * 10)
print("查找词汇为:{}, 出现的次数为:{}, 是否开启冗余模式: {}".format(find_word, occ_num, res))