jieba库常用函数
文本字符统计实例
一、统计《哈姆雷特》中出现次数最多的前10个词
def getText(path='hamlet.txt', encoding=None):
    """Read a text file and normalize it for word counting.

    The text is lower-cased and every punctuation character in the set
    below is replaced by a single space, so that words joined only by
    punctuation (e.g. ``be--that``) remain separate once the result is
    split on whitespace.

    Args:
        path: File to read. Defaults to ``'hamlet.txt'``, resolved against
            the current working directory.
        encoding: Text encoding handed to ``open``. ``None`` keeps the
            platform default.

    Returns:
        The normalized text as one string.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file content is not valid in the
            selected encoding.
    """
    # The last two characters are a backslash and an apostrophe, both
    # written with escapes so the literal terminates correctly.
    punctuation = '!@、";:,.()[]{}<>=-_*&^%$#`~/?|\\\''
    # A context manager guarantees the handle is closed even on error.
    with open(path, 'r', encoding=encoding) as f:
        txt = f.read().lower()
    # One translate pass instead of one replace() call per character.
    return txt.translate(str.maketrans(dict.fromkeys(punctuation, ' ')))
# Count how often each whitespace-separated word occurs in the normalized
# text and print the ten most frequent words with their counts.
hamletTxt = getText()
words = hamletTxt.split()
counts = {}
for word in words:
    counts[word] = counts.get(word, 0) + 1
items = list(counts.items())
# Sort by frequency, highest first.
items.sort(key=lambda x: x[1], reverse=True)
# Slice rather than index over range(10) so a text with fewer than ten
# distinct words does not raise IndexError.
for word, count in items[:10]:
    # Word left-aligned in 10 columns, count right-aligned in 5.
    print('{0:<10}{1:>5}'.format(word, count))