答:
之前的部分代码如下,可以输出 stopwords:

```python
import csv
import string

import nltk
from nltk.stem.snowball import SnowballStemmer
from sklearn import preprocessing
from sklearn.dummy import DummyClassifier

from Chapter01.tokenization import tokenize_nltk
stemmer = SnowballStemmer('english')
bbc_dataset = "Chapter04/bbc-text.csv"
stopwords_file_path = "Chapter01/stopwords.csv"
stopwords = []
def read_in_csv(csv_file):
with open(csv_file, 'r', encoding='utf-8') as fp:
reader = csv.reader(fp, delimiter=',', quotechar='"')
data_read = [row for row in reader]
return data_read
def tokenize_and_stem(sentence):
tokens = nltk.word_tokenize(sentence)
filtered_tokens = [t for t in tokens if t not in string.punctuation]
stems = [stemmer.stem(t) for t in filtered_tokens]
return stems
def get_stopwords(path=stopwords_file_path):
stopwords = read_in_csv(path)
stopwords = [word[0] for word in stopwords]
stemmed_stopwords = [stemmer.stem(word) for word in stopwords]
stopwords = stopwords + stemmed_stopwords
return stopwords
stopwords = get_stopwords(stopwords_file_path)
def get_data(filename):
data = read_in_csv(filename)
data_dict = {}
for row in data[1:]:
category = row[0]
text = row[1]
if (category not in data_dict.keys()):
data_dict[category] = []
data_dict[category].append(text)
return data_dict
```