文本预处理——去停用词
停用词文本可以从https://pan.baidu.com/s/1q21hIK95QU9qDstptd8V8g 自提,不谢
该停用词文本转自 https://blog.csdn.net/FontThrone/article/details/74200026 ,笔者尚未创建新的停用词表,后续更新……
# -*- coding: utf-8 -*-
import sys
# 获取停用词的List
def GetListOfStopWords(filepath):
    """Load a stop-word file and return its text split into a list.

    The file is opened as UTF-8 text, read in full, and split on the
    newline character.

    Args:
        filepath: Path to the stop-word text file (expected to hold one
            stop word per line).

    Returns:
        A list of strings, one per newline-separated segment of the file.
        Segments are returned verbatim (no stripping), so a file that ends
        with a newline yields a trailing empty string, and an empty file
        yields a list containing a single empty string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # The context manager closes the handle even if read() raises.
    with open(filepath, encoding='utf-8') as stop_file:
        stop_text = stop_file.read()
    return stop_text.split('\n')
# 保存List
# def SaveFile(list, filename):
# f_stop = open(filename, 'w', encoding='utf-8')
# for item in range(len(list)):
# if item != len(list):
# f_stop.writelines((list[item].encode('utf-8')) + '\n')
# else:
# f_stop.writelines(list[item].encode('utf-8'