# Remove high-frequency stop words to reduce model noise and speed up training.
def remove_fre_stop_word(words, *, t=1e-5, threshold=0.8):
    """Drop every occurrence of words that are too frequent in ``words``.

    For each distinct word ``w`` with relative frequency
    ``f(w) = count(w) / len(words)`` a drop score is computed as
    ``1 - sqrt(t / f(w))`` (the same expression as the frequent-word
    subsampling formula used for word2vec-style training).  Unlike random
    subsampling, the filter here is deterministic: a word is kept only if
    its score is strictly below ``threshold``; otherwise all of its
    occurrences are removed.

    Args:
        words: Sized, re-iterable sequence of hashable tokens (for example
            a list of strings).  It is iterated twice and ``len()`` is
            taken, so a one-shot iterator is not supported.
        t: Frequency scale of the drop score.  Words with ``f(w) <= t``
            get a score ``<= 0`` and are therefore always kept for any
            positive ``threshold``.
        threshold: Score at or above which a word is removed.

    Returns:
        A new list containing the surviving tokens in their original
        order.  An empty input yields an empty list.

    Note:
        With the defaults a word survives only when its relative frequency
        is below roughly ``t / (1 - threshold) ** 2 = 2.5e-4``, so corpora
        of fewer than about 4000 tokens are filtered down to nothing.
    """
    word_counts = collections.Counter(words)
    total_count = len(words)

    # The decision depends only on a word's frequency, so evaluate it once
    # per distinct word instead of once per token.
    kept_words = {
        word
        for word, count in word_counts.items()
        if 1 - np.sqrt(t / (count / total_count)) < threshold
    }

    return [word for word in words if word in kept_words]