import pathlib
import re
from collections import Counter
# Resolve the directory holding this script and the "text" folder next to it.
script_path = pathlib.PurePath(__file__).parent
text_path = pathlib.Path(script_path) / "text"
# Parse a text file into per-word frequency counts.
def parse_text(filename):
    """Return the word frequencies of a UTF-8 text file, rarest word first.

    Every character that is not a regex word character is replaced by a
    space, the text is lower-cased, and the result is split on whitespace
    to obtain the words.

    Args:
        filename: Path of the file to read; passed straight to ``open``.

    Returns:
        A list of ``(word, count)`` tuples sorted by ascending count. Words
        with equal counts keep the order of their first appearance in the
        file. An empty list is returned when the file contains no words.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filename, encoding='utf-8') as fs:
        content = fs.read()

    # Substitute before lower-casing so tokenisation is decided on the
    # original characters; split() with no argument never yields empty
    # strings, so no extra filtering is needed.
    words = re.sub(r'[^\w]', " ", content).lower().split()

    # Counter preserves first-insertion order and sorted() is stable, so
    # ties stay in order of first occurrence.
    word_counts = Counter(words)
    return sorted(word_counts.items(), key=lambda item: item[1])