实现过程比较坎坷，网上的资料都没法完全满足自己的需求。
# -*- coding:utf-8 -*-
import time
from collections import Counter
import re
import jieba
import jieba.analyse
# Count how often each multi-character Chinese word occurs in abc.txt and
# print the ten most frequent ones.

# Read the whole corpus up front; the context manager closes the file handle
# as soon as the text is in memory.
with open('abc.txt') as corpus:
    txt = corpus.read()

result = Counter()

# cut_all=False asks jieba for its accurate (non-"full") segmentation mode.
seg_list = jieba.cut(txt, cut_all=False)

# Matches a run of characters in the U+4E00..U+9FA5 range (CJK ideographs).
re_gex = u'[\u4E00-\u9FA5]+'
# Compiled once so the pattern is not looked up again for every token.
cjk_pattern = re.compile(re_gex)

for word in seg_list:
    # search() (not a full match) is used, so a token qualifies when it
    # contains at least one CJK character; single-character tokens are skipped.
    if len(word) > 1 and cjk_pattern.search(word):
        print(word)
        result[word] += 1
    else:
        # Rejected tokens are shown as a separator line.
        print('<------------>')

# Report in descending order of frequency.
print('*************************')
arr = result.most_common(10)  # keep only the ten most frequent words
for word, count in arr:
    print('key=%s,value=%s' % (word, count))
复制代码
结果:
key=陈笑,value=65
key=叶铮,value=48
key=白衣,value=14
key=大将军,value=14
key=皇子,value=13
key=对方,value=13
key=有些,value=12
key=元帅,value=11
key=自己,value=11
key=这个,value=9