# -*- coding: utf-8 -*-
#code by myhaspl
from __future__ import unicode_literals
from __future__ import division
from __future__ import print_function
import nltk
import sys
sys.path.append("../")
import jieba
def cutstring(txt):
    # Segment the text with jieba and join the tokens with spaces,
    # so NLTK's whitespace-based tools can consume the result
    cutstr = jieba.cut(txt)
    result = " ".join(cutstr)
    return result
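# For illustration (an addition to the original post, reusing the sample
# sentence from jieba's own README), accurate-mode segmentation looks like:
#   "/".join(jieba.cut("我来到北京清华大学"))  # -> 我/来到/北京/清华大学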
# Read the input file
txtfileobject = open('test2.txt', 'r')
try:
    filestr = txtfileobject.read()
finally:
    txtfileobject.close()
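# Note (an addition, not in the original post): to decode the UTF-8 Chinese
# text explicitly, and identically on Python 2 and 3, you can instead use
# io.open('test2.txt', encoding='utf-8') after an `import io`.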
cutstr = cutstring(filestr)
# word_tokenize on the space-joined string recovers the jieba tokens
tokenstr = nltk.word_tokenize(cutstr)
# Treating word lengths as the elements, count how often each length occurs
print("----word-length frequencies----")
fdist1 = nltk.FreqDist([len(w) for w in tokenstr])
for w, c in fdist1.items():
    print(w, "=>", c, "||", end=" ")
# Distinct word lengths
print()
print("----word lengths----")
print(sorted(fdist1.keys()))  # sort for a stable display
# Per-word frequencies
print()
print("----word frequencies----")
fdist2 = nltk.FreqDist(tokenstr)
for w, c in fdist2.items():
    print(w, "=>", c, "||", end=" ")
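As a quick extension (not in the original post): on NLTK 3, FreqDist inherits from collections.Counter, so most_common() returns the top-N tokens directly, which is a handy cross-check on the loop above:

# assumes the fdist2 built above; NLTK 3 API
print(fdist2.most_common(10))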
All content on this blog is original; please credit the source when reposting:
http://blog.csdn.net/myhaspl/

Program output:
----word-length frequencies----
1 => 750 || 2 => 864 || 3 => 80 || 4 => 28 || 5 => 2 || 6 => 1 ||
----word lengths----
[1, 2, 3, 4, 5, 6]
----word frequencies----
要 => 2 || 大脑皮层 => 2 || 一切 => 3 || 无意识 => 1 || 加快 => 1 || 一方面 => 1 || 通过 => 2 || 特性 => 1 || 电视观众 => 1 || 窗 => 1 || 圣哲 => 1 || 会 => 16 || 神经科学 => 1 || 被 => 3 ||