一,统计txt文件中各词语出现的频数,并找出出现频率最高的几个词语
二,代码:
import re
from collections import Counter
import jieba
def cut_word(datapath):
    """Read a UTF-8 text file, strip noise characters, and segment it with jieba.

    Args:
        datapath: Path of the text file to read (opened as UTF-8).

    Returns:
        The object produced by ``jieba.cut`` for the cleaned text. Its type
        is printed to stdout before it is returned.
    """
    # Three alternatives: whitespace plus ASCII-style punctuation/symbols,
    # a second set of punctuation/symbols, and runs of digits.
    noise_pattern = r"[\s+\.\!\/_,$%^*(【】:\]\[\-:;+\"\']+|[+——!,。?、~@#¥%……&*()]+|[0-9]+"

    with open(datapath, 'r', encoding='utf-8') as source:
        raw_text = source.read()

    # Every match is replaced with the empty string, i.e. simply deleted.
    cleaned_text = re.sub(noise_pattern, "", raw_text)

    segments = jieba.cut(cleaned_text)
    print(type(segments))
    return segments
def static_top_word(word_list, top=5):
    """Return the ``top`` most frequent words together with their counts.

    Args:
        word_list: Iterable of hashable items (words) to count. It is
            consumed once, so a generator is acceptable.
        top: Maximum number of entries to return. Values of zero or less
            yield an empty list.

    Returns:
        A list of ``(word, count)`` tuples ordered by count, highest first.
        Words with equal counts keep the order in which they were first
        seen. If there are fewer than ``top`` distinct words, all of them
        are returned instead of raising ``IndexError``.
    """
    counts = Counter(word_list)
    # Dump the full frequency table to stdout (output kept from the
    # original implementation).
    print(dict(counts))
    # most_common() truncates safely when fewer than `top` distinct words
    # exist; clamp negatives so they consistently mean "no results".
    return counts.most_common(max(top, 0))
def main():
    """Segment 'comment.txt' and print its most frequent words."""
    source_path = 'comment.txt'
    # Tokenise the file, then rank the tokens with the default `top` value.
    top_words = static_top_word(cut_word(source_path))
    print(top_words)
# Run the analysis only when this file is executed as a script, so that
# importing it does not trigger file reads and printing as a side effect.
if __name__ == "__main__":
    main()
三,说明:使用正则表达式过滤空白、特殊符号和数字,即用re.sub()将匹配到的字符替换为空字符串