# Course link (立即学习): https://edu.csdn.net/course/play/9460/199586?utm_source=blogtoedu
# Remove words that are not in the vocabulary
def clear_word_from_vocab(word_list, vocab):
    """Return the words of ``word_list`` that are present in ``vocab``.

    Order and duplicates of ``word_list`` are preserved; the input is not
    modified.

    Args:
        word_list: Iterable of words to filter.
        vocab: Any container supporting the ``in`` operator.

    Returns:
        A new list holding only the words found in ``vocab``.
    """
    return [word for word in word_list if word in vocab]
# Text preprocessing
def preprocessing_test_pd(text_dir, after_process_text_dir, stop_words):
    """Tokenize the ``title`` column of a CSV file and save the result.

    Reads the CSV at ``text_dir``, runs every ``title`` value through
    ``delete_r_n`` and ``jieba_cut``, replaces the cell with the
    space-joined result, and writes the frame to ``after_process_text_dir``
    using ``GrobalParament.encoding``.

    Args:
        text_dir: Path of the input CSV; it must contain a ``title`` column.
        after_process_text_dir: Path the processed CSV is written to.
        stop_words: Either a ``str`` path, which is loaded through
            ``get_stop_words``, or an already loaded stop-word collection,
            which is handed to ``jieba_cut`` unchanged.

    Returns:
        A list with one ``jieba_cut`` result per row, in row order.
    """
    # NOTE(review): the stop words must come from the argument itself; a
    # string is assumed to be the stop-word file path expected by
    # get_stop_words -- confirm against callers.
    if isinstance(stop_words, str):
        stop_words = get_stop_words(stop_words)

    sentences = []
    df = pd.read_csv(text_dir)
    for index, row in df.iterrows():
        print(index)  # progress output, one line per processed row
        title = delete_r_n(row['title'])
        word_list = jieba_cut(title, stop_words)
        # Store the tokens back as a single space-separated string.
        df.loc[index, 'title'] = " ".join(word_list)
        sentences.append(word_list)
    df.to_csv(after_process_text_dir, encoding=GrobalParament.encoding)
    return sentences