文章地址:http://blog.163.com/datamining_123/blog/static/218037022201422435414115/
1.测试文本:
test.txt
4.执行结果如下:
2.测试文本内容:
this is just for test
这只是用来测试的
this is just for test
这只是用来测试的
3.代码及解释如下:
import jieba
def doc2matrix(doc):
    """Segment a text file with jieba and print its word frequencies.

    The whole file is read, tokenized with ``jieba.cut``, and every token
    that is one of the listed punctuation marks (or a single space) is
    dropped.  One ``word:count`` line is printed per distinct remaining
    token, followed by all remaining tokens joined with ``/``.

    Args:
        doc: Path of the text file to analyse.

    Returns:
        None.  All results are written to standard output.
    """
    # Function-scope import so the block stays self-contained.
    from collections import Counter

    # The context manager closes the file even if reading raises.
    with open(doc, 'r') as x:
        y = x.read()  # read the entire content at once

    # A set gives an exact per-token match; testing against the raw string
    # would be a substring test and could also drop multi-character tokens.
    token = set(u"';?,。,.!、()() ")  # punctuation marks and the space

    # Keep only non-punctuation tokens.  Line breaks are not in the
    # punctuation set, so they are kept and counted like any other token.
    words = [w for w in jieba.cut(y) if w not in token]

    # Counter replaces the manual has_key/else counting loop.
    counts = Counter(words)
    for key, value in counts.items():
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(key + ':' + str(value))

    # Emit the segmentation result itself.
    print('/'.join(words))
# Script entry point: run the word-frequency demo on the sample file only
# when this file is executed directly, not when it is imported.
if __name__ == '__main__':
    doc2matrix(r'd:\desktop\test.txt')
>>> runfile('D:/desktop/untitled0.py', wdir=r'D:/desktop')
的:2
just:2
for:2
this:2
is:2
:3
这:2
只是:2
测试:2
test:2
用来:2
this/is/just/for/test/
/这/只是/用来/测试/的/
/this/is/just/for/test/
/这/只是/用来/测试/的
from collections import Counter

# Accumulate counts of whitespace-separated words over every line of a.txt.
c = Counter()
with open('a.txt', 'r', encoding='utf-8') as f:
    # Iterate the file object directly so lines are streamed instead of
    # being loaded all at once with readlines().
    for line in f:
        words = line.split()
        # Counter.update accepts any iterable of items, so no temporary
        # per-line Counter is needed.
        c.update(words)