from collections import Counter

import jieba
#获得去除标点的文本
def get_text(file_name):
    """Read a UTF-8 text file and return its contents with punctuation removed.

    Parameters:
        file_name: path of the text file to read.

    Returns:
        The file's text with every character in the deletion set stripped
        (no replacement character is inserted).
    """
    with open(file_name, 'r', encoding='utf-8') as fr:
        text = fr.read()
    # Characters to delete: Chinese/ASCII punctuation, newlines and spaces.
    del_ch = ['《', ',', '》', '\n', '。', '、', ';', '"',
              ':', ',', '!', '?', ' ']
    # Every entry is a single character, so one str.translate() pass
    # (mapping each code point to None) replaces N chained .replace() scans.
    return text.translate({ord(ch): None for ch in del_ch})
#文件名改为要分析的文件
# Change file_name to the file you want to analyse.
file_name = 'xxx'
text = get_text(file_name)
# Segment the cleaned text into words with jieba; returns a list of tokens.
vlist = jieba.lcut(text)
# Word-frequency count (Counter is the stdlib frequency map; insertion
# order among equal counts matches the original stable sort's tie order).
res_dict = Counter(vlist)
# Drop single-character tokens, which are rarely meaningful words;
# most_common() already yields (word, count) pairs in descending count order.
fin_res_list = [(word, count)
                for word, count in res_dict.most_common()
                if len(word) >= 2]
# Print up to the top 50 words. Slicing (instead of range(50)) avoids an
# IndexError when the text yields fewer than 50 multi-character words.
for rank, (word, count) in enumerate(fin_res_list[:50], start=1):
    pstr = str(rank) + ':'
    # BUG FIX: pstr was computed but never printed; the rank prefix is
    # now included in the output as originally intended.
    print(pstr, word, count)