#《三国演义》前60回中的高频词统计
import jieba
import os
# Step 1: load the novel text and segment it into words.
# expanduser() only resolves the leading "~" to the current user's home
# directory; it does not change any file permissions.
path1 = os.path.expanduser(r"~/Documents/python_project/Auxiliary file/sanguo_60.txt")
# Read through a context manager so the file handle is closed promptly
# instead of being left open until garbage collection.
# NOTE(review): the platform default encoding is used, exactly as before;
# pass encoding=... explicitly if the file is stored in a different one.
with open(path1) as text_file:
    article1 = text_file.read()
words = jieba.lcut(article1)

# Stop-word file: one entry per line, surrounding whitespace stripped.
# Tokens found in this list are skipped by the counting step.
path2 = os.path.expanduser(r"~/Documents/python_project/Auxiliary file/stopwords.txt")
with open(path2, 'r') as stopword_file:
    stopwords = [line.strip() for line in stopword_file]
# Step 2: count word occurrences, merging aliases of the same person.
# Alias -> canonical name. Words not listed here are counted under themselves.
name_aliases = {
    '孔明曰': '孔明', '诸葛亮': '孔明',
    '孟德': '曹操', '丞相': '曹操', '主公': '曹操',
    '玄德': '刘备', '玄德曰': '刘备', '大耳儿': '刘备', '先主': '刘备',
    '云长': '关羽', '关公': '关羽',
    '翼德': '张飞',
    '子龙': '赵云',
    '后主': '刘禅', '阿斗': '刘禅',
}
# Build the set once so each membership test is O(1) rather than a list scan.
stopword_set = set(stopwords)
word_freq = {}
for word in words:
    # Skip stop words and single-character tokens.
    if word in stopword_set or len(word) == 1:
        continue
    newword = name_aliases.get(word, word)
    # Always count under the canonical name. The previous code assigned the
    # Cao Cao aliases to a misspelled variable and created new entries under
    # the raw token, so aliases were never accumulated correctly.
    word_freq[newword] = word_freq.get(newword, 0) + 1
# Step 3: order the (word, count) pairs from most to least frequent.
# sorted() is stable, so words with equal counts keep their insertion order.
freq_word = sorted(word_freq.items(), key=lambda pair: pair[1], reverse=True)
# Step 4: ask how many entries to show, then print them.
# The reply is parsed with int() instead of eval(): eval() would execute any
# expression typed at the prompt, while int() accepts only an integer and
# raises ValueError for anything else.
max_num = int(input("请输入显示前多少位高频词: "))
for word, freq in freq_word[:max_num]:
    print(word, freq)