import jieba.posseg as pseg
import jieba
import jieba.analyse
import codecs
import math
# Set up the articles to be compared.
# Sample articles; `texts` below holds their contents in this same order.
files = [
    './sample/a1.txt',
    './sample/a2.txt',
    './sample/b1.txt',
    './sample/a3.txt',
]

# Read every article as UTF-8 text. Each handle is opened in a `with` block
# so it is closed as soon as its contents have been read, instead of being
# left open until garbage collection.
texts = []
for path in files:
    with codecs.open(path, 'r', 'utf8') as handle:
        texts.append(handle.read())
# Word-frequency statistics
from collections import Counter
def analyse_count(text, n=20):
    """Print the *n* most frequent tokens of *text* with their counts.

    The text is segmented with ``jieba.cut``. Each token is stripped of
    surrounding whitespace, and tokens that end up empty are discarded so
    that blank separators never appear in the ranking.

    Args:
        text: The string to segment and count.
        n: Maximum number of entries to print (default 20).

    Returns:
        None. One ``token<TAB>count`` line is printed per entry, followed by
        a final ``print('\\n')`` that separates consecutive reports.
    """
    stripped = (token.strip() for token in jieba.cut(text))
    # Whitespace-only tokens (spaces, newlines) strip down to '' and would
    # otherwise be counted as if they were a word.
    counter = Counter(token for token in stripped if token)
    for word, count in counter.most_common(n):
        print('%-10s\t%d' % (word, count))
    print('\n')
# Print a titled word-frequency report for each article. `files` and `texts`
# are parallel lists, so they are walked in lockstep rather than by index.
for path, text in zip(files, texts):
    print("%s 词频统计\n======================" % path)
    analyse_count(text)