# Feature extraction and vectorization for data stored as dicts.
# Each dict represents one sample: categorical fields ('city') are
# one-hot encoded, numeric fields ('temperature') pass through as-is.
measurements = [
    {'city': 'Dubai', 'temperature': 33.},
    {'city': 'London', 'temperature': 12.},
    {'city': 'San Fransisco', 'temperature': 18.},
]
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
# Print the transformed feature matrix in dense form
# (fit_transform returns a sparse matrix by default).
print(vec.fit_transform(measurements).toarray())
# Print the meaning of each feature dimension.
# NOTE: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the supported replacement.
print(vec.get_feature_names_out())
# Text-feature extraction: benchmark Naive Bayes classification using
# CountVectorizer (raw term counts) WITHOUT stop-word filtering.
from sklearn.datasets import fetch_20newsgroups
# Download the news samples on the fly; subset='all' fetches the full
# ~20k documents into `news`.
news = fetch_20newsgroups(subset='all')
# FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    news.data, news.target, test_size=0.25, random_state=33)
from sklearn.feature_extraction.text import CountVectorizer
count_vec = CountVectorizer()
# Convert raw training/test text to term-frequency feature vectors.
# Fit the vocabulary on the training split only, then reuse it on test.
X_count_train = count_vec.fit_transform(X_train)
X_count_test = count_vec.transform(X_test)
# Multinomial Naive Bayes suits discrete count features.
from sklearn.naive_bayes import MultinomialNB
mnb_count = MultinomialNB()
# Learn parameters from the (stop-words-kept) count features.
mnb_count.fit(X_count_train, y_train)
# Report overall accuracy on the held-out split.
print('新闻利用贝叶斯分类器(不去除停用词):', mnb_count.score(X_count_test, y_test))
y_count_predict = mnb_count.predict(X_count_test)
from sklearn.metrics import classification_report
# Per-class precision/recall/F1 breakdown.
print(classification_report(y_test, y_count_predict, target_names=news.target_names))
# Repeat the benchmark with TfidfVectorizer (still no stop-word filtering):
# TF-IDF down-weights terms common to many documents.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer()
X_tfidf_train = tfidf_vec.fit_transform(X_train)
X_tfidf_test = tfidf_vec.transform(X_test)
mnb_tfidf = MultinomialNB()
mnb_tfidf.fit(X_tfidf_train, y_train)
# FIX: the original label was identical to the CountVectorizer one,
# making the two printed scores indistinguishable; name the vectorizer.
print('新闻利用贝叶斯分类器(TfidfVectorizer,不去除停用词):', mnb_tfidf.score(X_tfidf_test, y_test))
y_tfidf_predict = mnb_tfidf.predict(X_tfidf_test)
print(classification_report(y_test, y_tfidf_predict, target_names=news.target_names))
# TF-IDF improves on raw counts here: with a large training corpus,
# suppressing ubiquitous terms reduces their interference with the
# classification decision and lifts model performance.
# Filter ubiquitous words out of feature extraction via a stop-word
# blacklist, expecting a further performance gain.
# Initialize both vectorizers with English stop-word filtering enabled.
count_filter_vec = CountVectorizer(analyzer='word', stop_words='english')
tfidf_filter_vec = TfidfVectorizer(analyzer='word', stop_words='english')
# Vectorize with the filtered vocabularies (fit on train, reuse on test).
X_count_filter_train = count_filter_vec.fit_transform(X_train)
X_count_filter_test = count_filter_vec.transform(X_test)
X_tfidf_filter_train = tfidf_filter_vec.fit_transform(X_train)
X_tfidf_filter_test = tfidf_filter_vec.transform(X_test)
# Naive Bayes on stop-word-filtered term counts.
mnb_count_filter = MultinomialNB()
mnb_count_filter.fit(X_count_filter_train, y_train)
# FIX: the two original labels were identical; name each vectorizer so
# the printed scores are distinguishable.
print('新闻利用贝叶斯分类器(CountVectorizer,去除停用词):', mnb_count_filter.score(X_count_filter_test, y_test))
y_count_filter_predict = mnb_count_filter.predict(X_count_filter_test)
# Naive Bayes on stop-word-filtered TF-IDF features.
mnb_tfidf_filter = MultinomialNB()
mnb_tfidf_filter.fit(X_tfidf_filter_train, y_train)
print('新闻利用贝叶斯分类器(TfidfVectorizer,去除停用词):', mnb_tfidf_filter.score(X_tfidf_filter_test, y_test))
y_tfidf_filter_predict = mnb_tfidf_filter.predict(X_tfidf_filter_test)
# classification_report is already imported above — no re-import needed.
print(classification_report(y_test, y_count_filter_predict, target_names=news.target_names))
print(classification_report(y_test, y_tfidf_filter_predict, target_names=news.target_names))
# Stop-word-filtered feature extraction scores roughly 3-4 points higher
# in aggregate than the unfiltered models.