def getText(path="Hamlet.txt"):
    """Return the contents of a text file, normalised for word counting.

    The text is lower-cased and every character in the punctuation set
    below is replaced by a single space, so that a later ``str.split()``
    yields bare words.  The apostrophe is not part of the set and is
    therefore kept (e.g. ``that's`` stays one token).

    Args:
        path: File to read.  Defaults to ``"Hamlet.txt"`` in the current
            working directory.

    Returns:
        The normalised text as one string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
    # One translation table gives a single pass over the text instead of
    # one full-string replace per punctuation character.
    to_space = str.maketrans(punctuation, " " * len(punctuation))
    # The context manager guarantees the file is closed, even on error.
    with open(path, "r") as source:
        txt = source.read()
    return txt.lower().translate(to_space)
# Count how often each word occurs in Hamlet and print the ten most common.
hamletTxt = getText()
words = hamletTxt.split()  # split on whitespace into individual words

# Map each distinct word to its number of occurrences.
counts = {}
for word in words:
    counts[word] = counts.get(word, 0) + 1

# Order the (word, count) pairs from most to least frequent.
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)

# Slicing tolerates texts with fewer than ten distinct words, where
# indexing items[0..9] would raise IndexError.
for word, count in items[:10]:
    print("{0:<10}{1:>5}".format(word, count))
结果:
示例2:找出《三国演义》中出场次数最多的十个人物
import jieba
# Count how often each name occurs in the novel text and print the ten
# most frequent, after merging aliases and dropping common non-name words.
with open("三国演义.txt", "r", encoding="utf-8") as source:  # read the novel
    txt = source.read()

# Frequent words that are not character names and must not be ranked.
excludes = {
    "将军", "却说", "二人", "不可", "荆州", "不能", "如此", "商议", "如何",
    "军士", "主公", "左右", "军马", "引兵", "次日", "大喜", "天下", "东吴",
    "于是", "今日", "不敢", "魏兵", "陛下", "一人", "都督", "人马", "不知",
}

# Alternative names/titles mapped to the single name they are counted under.
aliases = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "关公": "关羽",
    "云长": "关羽",
    "玄德": "刘备",
    "玄德曰": "刘备",
    "孟德": "曹操",
    "丞相": "曹操",
}

words = jieba.lcut(txt)  # segment the text into a list of words

counts = {}
for word in words:
    if len(word) == 1:
        # Single-character tokens are skipped entirely.
        continue
    rword = aliases.get(word, word)
    counts[rword] = counts.get(rword, 0) + 1

for word in excludes:
    # pop() with a default avoids the KeyError that `del` raises when an
    # excluded word never occurs in the text.
    counts.pop(word, None)

# Order the (name, count) pairs from most to least frequent.
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)

# Slicing tolerates results with fewer than ten entries.
for word, count in items[:10]:
    print("{0:<10}{1:>5}".format(word, count))