"""Classify the 20 Newsgroups corpus with Multinomial Naive Bayes over
raw term-count features (CountVectorizer with default settings, i.e.
stop words are NOT removed)."""
from sklearn.datasets import fetch_20newsgroups
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Classify 20newsgroups using a CountVectorizer WITHOUT stop-word removal.
# Download (or load from local cache) the full 20 Newsgroups dataset.
news = fetch_20newsgroups(subset='all')

# Split the data: 75% training / 25% test, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    news.data, news.target, test_size=0.25, random_state=33)

# Initialize CountVectorizer with its default configuration (term counts,
# stop words kept).
count_vec = CountVectorizer()
# Convert the raw training and test texts into term-count feature vectors.
X_count_train = count_vec.fit_transform(X_train)
# Only transform (never fit) the test set, so the vocabulary comes
# exclusively from the training data.
X_count_test = count_vec.transform(X_test)

# Initialize the Multinomial Naive Bayes classifier with default settings
# (Laplace smoothing, alpha=1.0).
mnb_count = MultinomialNB()
# Fit the classifier on the training samples (stop words not removed).
mnb_count.fit(X_count_train, y_train)

# Evaluate on the held-out test set.
print('Accuracy (CountVectorizer without stop-word filtering):',
      mnb_count.score(X_count_test, y_test))
y_count_predict = mnb_count.predict(X_count_test)
print(classification_report(y_test, y_count_predict,
                            target_names=news.target_names))