停用词删除
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
def Delete_stopwords(example_sent):
    """Tokenize a sentence and drop English stop words.

    Args:
        example_sent: The text to tokenize and filter.

    Returns:
        A ``(word_tokens, filtered_sentence)`` tuple: every token produced
        by ``word_tokenize``, and those same tokens, in order, with the
        English stop words removed.

    Note:
        NLTK must be able to locate its stop-word corpus and tokenizer
        data at call time.
    """
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(example_sent)
    # NLTK's English stop-word list is lowercase, so compare on the
    # lowercased token; otherwise capitalised stop words such as "This"
    # slip through. The original token text is kept in the output.
    filtered_sentence = [w for w in word_tokens if w.lower() not in stop_words]
    return word_tokens, filtered_sentence
# Demo: run the filter on a sample sentence and show both the complete
# token list and the list that remains after stop-word removal.
example_sent = (
    "This is a sample sentence, showing off the stop words filtration."
)
word_tokens, filtered_sentence = Delete_stopwords(example_sent)
print(word_tokens, filtered_sentence)
停用词文件:
https://www.nltk.org/nltk_data/ —— 73项