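# Chinese word-frequency statistics.
# Requires the third-party jieba package to be installed (e.g. pip install jieba).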
import jieba
# Read the whole input text and segment it into a list of tokens with jieba.
with open('threekingdoms.txt', 'r', encoding='UTF-8') as fr:
    s = fr.read()
ls = jieba.lcut(s)

# Tokens that are removed from the counts before ranking.
excludes = ['二人', '不可', '荆州', '却说']

# Tokens on the left are counted under the canonical name on the right.
aliases = {
    '诸葛亮': '孔明',
    '孔明曰': '孔明',
    '关公': '关羽',
    '云长': '关羽',
    '玄德': '刘备',
    '玄德曰': '刘备',
    '孟德': '曹操',
    '丞相': '曹操',
}

counts = {}
for word in ls:
    # Single-character tokens are not counted.
    if len(word) == 1:
        continue
    rword = aliases.get(word, word)
    counts[rword] = counts.get(rword, 0) + 1

# Drop the tokens we are not looking for. pop() with a default is used
# instead of `del` so an excluded token that never occurred in the text
# does not raise KeyError.
for word in excludes:
    counts.pop(word, None)

# Sort the (token, count) pairs by count, highest first.
lst = list(counts.items())
lst.sort(key=lambda x: x[1], reverse=True)

# Print the 15 most frequent entries. Slicing (rather than indexing
# range(15)) avoids IndexError when fewer than 15 entries exist.
for key, value in lst[:15]:
    print('{0:<5}{1:>5}'.format(key, value))