我在这里阅读了所有相关问题,但找不到可行的解决方案:
我创建分类器的代码如下:
class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer variant that stems the tokens produced by its analyzer.

    The inherited analyzer is built as usual and its output is then passed
    through ``english_stemmer.stemWords`` (``english_stemmer`` is expected to
    be defined at module level, outside this class).
    """

    def build_analyzer(self):
        """Return a callable that maps a document to its stemmed tokens.

        NOTE(review): stemming is applied to whatever the parent analyzer
        returns; with ``ngram_range`` above 1 that presumably includes
        multi-word n-gram strings, not only single words -- confirm this is
        the intended behaviour.
        """
        # Name this class (not its parent) in super() so the method lookup
        # starts right after StemmedTfidfVectorizer in the MRO and does not
        # skip over TfidfVectorizer.
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: english_stemmer.stemWords(analyzer(doc))
# Module-level vectorizer used by create_tfidf(): word-level analysis,
# unigrams and bigrams, English stop-word list, at most 200000 features.
tf = StemmedTfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=0,
    max_features=200000,
    stop_words='english',
)
def create_tfidf(f):
    """Build a TF-IDF matrix and a target list from a ';'-delimited CSV file.

    The first row of the file is treated as a header and skipped. For every
    remaining row, column 0 is collected as the target and column 1 as the
    document text. The documents are then fitted and transformed with the
    module-level vectorizer ``tf``, and the shape of the resulting matrix is
    printed.

    Args:
        f: Path of the CSV file to read.

    Returns:
        A ``(tfidf_matrix, targets)`` tuple, where ``tfidf_matrix`` is the
        result of ``tf.fit_transform`` on the documents and ``targets`` is
        the list of column-0 values in file order.

    Raises:
        IndexError: If a data row has fewer than two columns.
    """
    docs = []
    targets = []
    with open(f, "r") as sentences_file:
        reader = csv.reader(sentences_file, delimiter=';')
        # Skip the header row. The built-in next() works on both Python 2
        # and Python 3 (reader.next() is Python-2-only); the default avoids
        # a bare StopIteration when the file is empty.
        next(reader, None)
        for row in reader:
            docs.append(row[1])
            targets.append(row[0])
    tfidf_matrix = tf.fit_transform(docs)
    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print(tfidf_matrix.shape)
    return tfidf_matrix, targets
X,y = cre