Working through the example on p. 222 of the book 白话大数据与机器学习 (Plain Talk on Big Data and Machine Learning):
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups
from sklearn.neighbors import NearestCentroid  # the sklearn.neighbors.nearest_centroid module path was removed in newer scikit-learn
from pprint import pprint
import sys
# Load the data
newsgroups_train = fetch_20newsgroups(subset='train')
pprint(list(newsgroups_train.target_names))
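# Expected output: the full list of 20 newsgroup names, from 'alt.atheism'
# through 'talk.religion.misc'.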
# Pick 4 of the 20 topics
categories = ['alt.atheism','comp.graphics','soc.religion.christian','sci.med']
# Download the documents for these 4 topics
train_data = fetch_20newsgroups(subset = "train", categories = categories)
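# train_data is a Bunch object: .data holds the raw texts, .target the numeric
# label of each text, and .target_names the label strings.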
# The document contents are in train_data.data; tokenize and vectorize them
count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(train_data.data)
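# train_counts is a sparse document-term matrix: one row per document, one
# column per vocabulary word. A quick sanity check (exact numbers can vary
# with the scikit-learn version):
print(train_counts.shape)  # roughly (2257, 35788) for these 4 categories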
# Next, apply a TF-IDF transform to the vectorized counts
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(train_counts)
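# TF-IDF reweights the raw counts: term frequency * inverse document
# frequency, so words that occur in almost every document get a low weight.
# The matrix keeps the same shape as train_counts.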
# Now feed the TF-IDF features, together with each document's topic label
# (train_data.target), into a classifier and train it
clf = NearestCentroid().fit(train_tfidf, train_data.target)
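# NearestCentroid (a Rocchio-style classifier) averages each class's TF-IDF
# vectors into a centroid and assigns a new document to the nearest centroid.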
# Build a test set of two documents (one string each) and run the same
# vectorization and TF-IDF steps. Note: use transform(), not fit_transform(),
# so the test documents are mapped into the vocabulary learned from the
# training set; refitting on the test set would produce incompatible features.
docs_new = ['OpenGL on the GPU is fast', 'God is love']
docs_new_counts = count_vect.transform(docs_new)
docs_new_tfidf = tfidf_transformer.transform(docs_new_counts)
# Print the version of the scikit-learn module that is actually loaded (an
# assumed completion; this is the only use of the sys import above).
print(sys.modules['sklearn'].__version__)
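# Final step (a minimal sketch, not in the excerpt above): predict a topic for
# each new document and map the label index back to a topic name.
predicted = clf.predict(docs_new_tfidf)
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, train_data.target_names[category]))
# The OpenGL sentence should land in comp.graphics and 'God is love' in
# soc.religion.christian.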