目录
文档分类例子地址
https://github.com/cystanford/text_classification
计算文档中的 TF-IDF 值
from sklearn.feature_extraction.text import TfidfVectorizer
# Demo: fit a TF-IDF model on a tiny corpus and inspect its vocabulary.
#
# Constructor options of interest:
#   stop_words    - words to drop before building the vocabulary
#   token_pattern - regular expression that defines what counts as a token
# e.g. TfidfVectorizer(stop_words=stop_words, token_pattern=token_pattern)
tfidf_vec = TfidfVectorizer(stop_words=['is'])
documents = [
    'this is the bayes document',
    'this is the second second document',
    'and the third one',
    'is this the document',
]
# Learn the vocabulary from the corpus and build the document-term matrix.
tfidf_matrix = tfidf_vec.fit_transform(documents)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out(); .tolist() turns the returned ndarray back
# into a plain list of str so the printed output matches the old method.
# Fall back to the legacy method only on installs that predate 1.0.
if hasattr(tfidf_vec, 'get_feature_names_out'):
    feature_names = tfidf_vec.get_feature_names_out().tolist()
else:
    feature_names = tfidf_vec.get_feature_names()

# Distinct terms kept in the vocabulary.
print('不重复的词:', feature_names)
# Mapping from each term to its column index in the matrix.
print('每个单词的ID:', tfidf_vec.vocabulary_)
print('每个单词的tfidf值:', tfidf_