import jieba
# Count how often each major character of "Dream of the Red Chamber" is
# mentioned in the novel and print the 20 most frequent names.

# Read the whole novel. A raw string keeps the backslash in the Windows path
# literal (a bare "\红" is an invalid escape sequence), and the context manager
# closes the file as soon as the text has been read.
with open(r"D:\红楼梦.txt", "r", encoding="utf-8") as book_file:
    txt = book_file.read()

# Frequent words that are not character names and must not appear in the
# ranking.
# NOTE(review): "鸳鸯" is a character's name; it is kept here only to preserve
# the existing output -- confirm that excluding it is intended.
excludes = {
    "什么", "一个", "我们", "那里", "你们", "如今", "说道", "知道",
    "起来", "姑娘", "这里", "出来", "他们", "众人", "自己", "一面",
    "太太", "只见", "怎么", "奶奶", "两个", "没有", "不是", "不知",
    "这个", "听见", "这样", "进来", "咱们", "告诉", "就是", "东西",
    "回来", "只是", "大家", "老爷", "只得", "丫头", "这些", "不敢",
    "出去", "所以", "不过", "的话", "不好", "姐姐", "不能", "一时",
    "鸳鸯", "过来", "心里", "二爷", "如此", "今日", "银子", "两人",
    "还有", "几个", "却说", "二人", "只管", "这么", "说话", "一回",
    "那边", "这话", "外头", "今儿", "罢了", "屋里", "那些", "听说",
    "小丫头", "打发", "自然", "如何", "问道", "看见", "人家", "不用",
    "媳妇",
}

# Different ways of addressing the same person, mapped to one canonical name
# so that their occurrences are merged into a single count.
aliases = {
    "老太太": "贾母",
    "老祖宗": "贾母",
    "林妹妹": "林黛玉",
    "林姑娘": "林黛玉",
    "黛玉": "林黛玉",
    "宝钗": "薛宝钗",
    "宝姑娘": "薛宝钗",
    "怡红公子": "贾宝玉",
    "宝玉": "贾宝玉",
    "凤丫头": "王熙凤",
    "凤姐": "王熙凤",
    "凤哥儿": "王熙凤",
    "凤辣子": "王熙凤",
    "蓉大奶奶": "秦可卿",
    "可卿": "秦可卿",
    "湘云": "史湘云",
    "史湘云": "史湘云",
    "四姑娘": "贾惜春",
    "惜春": "贾惜春",
    "探春": "贾探春",
    "三姑娘": "贾探春",
}

# Segment the text into a list of words (jieba's default, precise mode).
words = jieba.lcut(txt)

# Word -> number of occurrences, keyed by canonical name where one exists.
counts = {}
for word in words:
    # Single-character tokens (punctuation, particles) are never names.
    if len(word) == 1:
        continue
    rword = aliases.get(word, word)
    counts[rword] = counts.get(rword, 0) + 1

# Drop the non-name words. pop() with a default tolerates excluded words that
# never occur in the text, where `del` would raise KeyError.
for word in excludes:
    counts.pop(word, None)

# Rank by frequency, highest first.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

# Print at most the top 20; slicing copes with texts that yield fewer entries.
for word, count in items[:20]:
    print("{0:<10}{1:>5}".format(word, count))