参考博客:https://blog.csdn.net/brucewong0516/article/details/79055480
本地有一个停用词表eng_stop_words.txt文本,现在使用pickle打包:
def savedb(pre_file, filename):
    """Pickle the lines of a text file as a list of stripped strings.

    Args:
        pre_file: Path of the source text file, read line by line.
        filename: Path of the pickle file to write (opened in 'wb' mode,
            so an existing file is overwritten).

    Returns:
        None. The result is the pickle file written to ``filename``.
    """
    with open(pre_file, 'r') as src:
        # Iterate the file object directly; no need to materialise
        # readlines() first. strip() drops the newline and any
        # surrounding whitespace on each line.
        content = [line.strip() for line in src]
    # Use a context manager for the output too, so the pickle is flushed
    # and the handle closed deterministically instead of relying on GC.
    with open(filename, 'wb') as dst:
        pickle.dump(content, dst)
# Pack the local stop-word text file eng_stop_words.txt into eng_stop_words.pkl.
savedb('eng_stop_words.txt', 'eng_stop_words.pkl')
使用时:
def drop_stopwords(contents, stopwords):
    """Remove stop words from tokenised text and flatten the result.

    Args:
        contents: Iterable of lines, each line itself an iterable of
            words (words must be hashable, e.g. strings).
        stopwords: Iterable of words to discard.

    Returns:
        A single flat list containing every word of every line that is
        not in ``stopwords``, in original order (duplicates preserved).
    """
    # Build the lookup set once: membership tests become O(1) instead of
    # a linear scan of the stop-word list for every single word.
    stopword_set = set(stopwords)
    return [
        word
        for line in contents
        for word in line
        if word not in stopword_set
    ]
# Read the pickled stop-word file. The context manager closes the handle
# as soon as loading finishes instead of leaving it open until GC.
# NOTE(review): pickle can execute arbitrary code while loading -- only
# unpickle files you produced yourself.
with open('eng_stop_words.pkl', 'rb') as pkl_file:
    stopwords = pickle.load(pkl_file)
content_list = drop_stopwords(content, stopwords)