def getText():
    """Read ``hamlet.txt`` and return its text normalised for word counting.

    The text is lower-cased and every character of the punctuation set below
    is replaced by a single space, so that a later ``str.split()`` yields
    plain words.

    Returns:
        str: The normalised contents of ``hamlet.txt``.

    Raises:
        OSError: If ``hamlet.txt`` cannot be opened or read.
    """
    # A context manager closes the file handle deterministically instead of
    # leaving it to the garbage collector.
    with open("hamlet.txt", "r") as f:
        txt = f.read()
    txt = txt.lower()
    # NOTE(review): '!' and the ASCII apostrophe are not in this set, so
    # "word!" is counted separately from "word" -- confirm this is intended.
    specials = '|"#$%&^()*+,-./:;<>=?@[]\\_‘{}~'
    # One translate pass maps each special character to a space.
    return txt.translate(str.maketrans(specials, " " * len(specials)))
# English text: print the 10 most frequent words of hamlet.txt.
hamletText = getText()
words = hamletText.split()  # split the normalised text into a list of words
counts = {}  # maps word -> number of occurrences
for word in words:
    counts[word] = counts.get(word, 0) + 1
items = list(counts.items())  # list of (word, count) pairs
# Sort by frequency, most frequent first.
items.sort(key=lambda x: x[1], reverse=True)
# Slicing (rather than indexing range(10)) avoids an IndexError when the
# text contains fewer than 10 distinct words.
for word, count in items[:10]:
    print("{0:<10}{1:>5}".format(word, count))
# 中文文本词频统计 (Chinese text word-frequency statistics)
import jieba
# Print the 15 most frequent multi-character words of threekingdoms.txt.
# The context manager closes the file once its contents have been read.
with open("threekingdoms.txt", "r", encoding="utf-8") as f:
    txt = f.read()
words = jieba.lcut(txt)  # segment the text into a list of words with jieba
counts = {}  # maps word -> number of occurrences
for word in words:
    if len(word) == 1:
        continue  # single-character tokens are not counted
    counts[word] = counts.get(word, 0) + 1
items = list(counts.items())
# Sort by frequency, most frequent first.
items.sort(key=lambda x: x[1], reverse=True)
# Slicing avoids an IndexError when fewer than 15 distinct words exist.
for word, count in items[:15]:
    print("{0:<10}{1:>5}".format(word, count))
import jieba
# Print the 15 most frequent multi-character words of threekingdoms.txt,
# merging alternative spellings of a name and dropping excluded words.
# The context manager closes the file once its contents have been read.
with open("threekingdoms.txt", "r", encoding="utf-8") as f:
    txt = f.read()
# Words removed from the tally before ranking.
excludes = {"将军", "却说", "荆州", "二人", "不可", "不能", "如此"}
# Tokens that are counted under a single canonical name.
aliases = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "关公": "关羽",
    "云长": "关羽",
    "玄德": "刘备",
    "玄德曰": "刘备",
}
words = jieba.lcut(txt)  # segment the text into a list of words with jieba
counts = {}  # maps canonical word -> number of occurrences
for word in words:
    if len(word) == 1:
        continue  # single-character tokens are not counted
    # Fall back to the word itself when it has no alias. The original
    # assigned to a misspelled name here, leaving rword stale or undefined.
    rword = aliases.get(word, word)
    # Read and write the SAME key so merged aliases accumulate correctly.
    counts[rword] = counts.get(rword, 0) + 1
for word in excludes:
    # pop() with a default tolerates excluded words that never occurred.
    counts.pop(word, None)
items = list(counts.items())
# Sort by frequency, most frequent first.
items.sort(key=lambda x: x[1], reverse=True)
# Slicing avoids an IndexError when fewer than 15 distinct words exist.
for word, count in items[:15]:
    print("{0:<10}{1:>5}".format(word, count))