"""Print the most frequently mentioned words/characters in 红楼梦.txt.

The text is segmented with jieba, single-character tokens are dropped,
alternate names of the main characters are merged into one canonical name,
a fixed set of common non-name words is removed, and the top entries are
printed as an aligned two-column table.
"""
from collections import Counter

# Frequent words that are not character names; removed from the final counts.
excludes = {"他们","众人","姑娘","这里","我们","那里","知道","起来","什么","一个","如今","说道","自己","奶奶","两个"}

# Alternate name -> canonical name under which occurrences are counted.
ALIASES = {
    "宝二爷": "宝玉",
    "贾宝玉": "宝玉",
    "林姑娘": "林黛玉",
    "黛玉": "林黛玉",
    "宝姑娘": "薛宝钗",
    "宝钗": "薛宝钗",
    "凤姐": "王熙凤",
    "凤姐儿": "王熙凤",
    "老祖宗": "贾母",
    "贾母": "贾母",
    "芙蓉仙子": "晴雯",
    "病西施": "晴雯",
    "史大姑娘": "史湘云",
    "湘云": "史湘云",
    "惜春": "贾惜春",
    "贾惜春": "贾惜春",
    "迎春": "贾迎春",
    "贾迎春": "贾迎春",
}


def normalize_word(word: str) -> str:
    """Return the canonical name for *word*, or *word* itself if it has no alias."""
    return ALIASES.get(word, word)


def count_words(words, stopwords=excludes) -> dict:
    """Count normalized words, skipping single-character tokens and stopwords.

    Args:
        words: iterable of segmented tokens (strings).
        stopwords: words to drop from the result after counting.

    Returns:
        A plain dict mapping each remaining (canonical) word to its count,
        in order of first occurrence.
    """
    counts = Counter(
        normalize_word(word) for word in words if len(word) != 1
    )
    for word in stopwords:
        # pop() with a default: a stopword that never occurs is not an error.
        counts.pop(word, None)
    return dict(counts)


def top_words(counts: dict, n: int = 20) -> list:
    """Return up to *n* ``(word, count)`` pairs, highest count first.

    Ties keep the dict's insertion order because the sort is stable. Fewer
    than *n* pairs are returned when *counts* has fewer entries.
    """
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:n]


def main(path: str = "红楼梦.txt", top_n: int = 20) -> None:
    """Segment the text at *path* and print its *top_n* most frequent words."""
    # Imported here rather than at module top so the pure helpers above can
    # be imported without the third-party jieba package installed.
    import jieba

    with open(path, "r", encoding='utf-8') as handle:
        txt = handle.read()
    # lcut segments in jieba's precise mode and returns a list of tokens.
    words = jieba.lcut(txt)
    for word, count in top_words(count_words(words), top_n):
        # Word left-aligned in 10 columns, count right-aligned in 5.
        print("{0:<10}{1:>5}".format(word, count))


if __name__ == "__main__":
    main()