1.高斯朴素贝叶斯对鸢尾花数据进行分类
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
import matplotlib as mpl
from sklearn. preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn. naive_bayes import GaussianNB, MultinomialNB
from sklearn. pipeline import Pipeline
from sklearn. metrics import accuracy_score
from sklearn. model_selection import train_test_split
from sklearn. neighbors import KNeighborsClassifier
if __name__ == "__main__":
    # Script: fit a Gaussian naive Bayes pipeline on two iris features and
    # plot the predicted class regions together with the train/test samples.

    # 'iris.data' has no header row; columns 0-3 are read as the features and
    # column 4 as the class label.
    data = pd.read_csv('iris.data', header=None)
    x, y = data[np.arange(4)], data[4]
    # Replace the label values with integer category codes.
    y = pd.Categorical(values=y).codes
    print(y)

    # Axis labels (sepal length, sepal width, petal length, petal width).
    feature_names = u'花萼长度', u'花萼宽度', u'花瓣长度', u'花瓣宽度'
    # Only these two columns are used, so the result can be drawn in 2-D.
    features = [0, 1]
    x = x[features]

    # Take the plotting range from the complete data set *before* splitting,
    # so that test samples lying outside the range of the training split are
    # still covered by the prediction grid and not clipped by xlim/ylim.
    x1_min, x2_min = x.min()
    x1_max, x2_max = x.max()

    x, x_test, y, y_test = train_test_split(x, y, train_size=0.7, random_state=0)

    # Explicit class priors in the ratio 1:2:4, normalised to sum to 1.
    priors = np.array((1, 2, 4), dtype=float)
    priors /= priors.sum()
    gnb = Pipeline([
        ('sc', StandardScaler()),
        ('poly', PolynomialFeatures(degree=1)),
        ('clf', GaussianNB(priors=priors))])
    gnb.fit(x, y.ravel())

    y_hat = gnb.predict(x)
    print('训练集准确度: %.2f%%' % (100 * accuracy_score(y, y_hat)))
    y_test_hat = gnb.predict(x_test)
    print('测试集准确度:%.2f%%' % (100 * accuracy_score(y_test, y_test_hat)))

    # Build an N x M grid over the plotting range and classify every grid
    # point to obtain the background colouring.
    N, M = 500, 500
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    x1, x2 = np.meshgrid(t1, t2)
    x_grid = np.stack((x1.ravel(), x2.ravel()), axis=1)

    # Font setting so the Chinese labels render; keep the ASCII minus sign.
    mpl.rcParams['font.sans-serif'] = [u'simHei']
    mpl.rcParams['axes.unicode_minus'] = False
    cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])

    y_grid_hat = gnb.predict(x_grid)
    y_grid_hat = y_grid_hat.reshape(x1.shape)

    plt.figure(facecolor='w')
    plt.pcolormesh(x1, x2, y_grid_hat, cmap=cm_light)
    # Training samples: default markers; test samples: larger triangles.
    plt.scatter(x[features[0]], x[features[1]], c=y, edgecolors='k', s=50, cmap=cm_dark)
    plt.scatter(x_test[features[0]], x_test[features[1]], c=y_test, marker='^',
                edgecolors='k', s=120, cmap=cm_dark)
    plt.xlabel(feature_names[features[0]], fontsize=13)
    plt.ylabel(feature_names[features[1]], fontsize=13)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.title(u'GaussianNB对鸢尾花数据的分类结果', fontsize=18)
    plt.grid(True)
    plt.show()
[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 ]
训练集准确度: 82.86 %
测试集准确度:71.11 %
2.文本数据的处理流程-20个类别的新闻组数据
import numpy as np
from sklearn. naive_bayes import MultinomialNB, BernoulliNB
from sklearn. datasets import fetch_20newsgroups
from sklearn. feature_extraction. text import TfidfVectorizer
from sklearn. linear_model import RidgeClassifier
from sklearn. neighbors import KNeighborsClassifier
from sklearn. svm import SVC
from sklearn. ensemble import RandomForestClassifier
from sklearn. model_selection import GridSearchCV
from sklearn import metrics
from time import time
from pprint import pprint
import matplotlib. pyplot as plt
import matplotlib as mpl
def test_clf(clf):
    """Tune ``clf`` with a 5-fold grid search and evaluate it on the test split.

    The data is read from the module-level names ``x_train``, ``y_train``,
    ``x_test`` and ``y_test``.  Which parameter grid is searched depends on
    the attributes ``clf`` exposes (``alpha``, ``n_neighbors``, ``C``,
    ``max_depth``); if several are present, the last one in that order wins.
    When none is present the ``alpha`` grid is used.

    Returns a tuple ``(t_train, t_test, error, name)``:
    the fit time divided by ``5 * number_of_grid_candidates``, the time spent
    in ``predict`` on the test split, ``1 - accuracy`` on the test split, and
    a short display name derived from ``str(clf)``.
    """
    print(u'分类器:', clf)

    # Start from the alpha grid, then let later attribute checks override it.
    search_space = {'alpha': np.logspace(-3, 2, 10)}
    if hasattr(clf, 'n_neighbors'):
        search_space = {'n_neighbors': np.arange(1, 15)}
    if hasattr(clf, 'C'):
        search_space = {'C': np.logspace(1, 3, 3),
                        'gamma': np.logspace(-3, 0, 3)}
    if hasattr(clf, 'max_depth'):
        search_space = {'max_depth': np.arange(4, 10)}

    # Number of parameter combinations = product of the grid sizes.
    n_candidates = 1
    for values in search_space.values():
        n_candidates *= values.size

    model = GridSearchCV(clf, param_grid=search_space, cv=5)

    fit_began = time()
    model.fit(x_train, y_train)
    fit_elapsed = time() - fit_began
    t_train = fit_elapsed / (5 * n_candidates)
    print(u'5折交叉验证的训练时间为:%.3f秒/(5*%d)=%.3f秒' % (fit_elapsed, n_candidates, t_train))
    print(u'最优超参数为:', model.best_params_)

    predict_began = time()
    y_hat = model.predict(x_test)
    t_test = time() - predict_began
    print(u'测试时间:%.3f秒' % t_test)

    acc = metrics.accuracy_score(y_test, y_hat)
    print(u'测试集准确率:%.2f%%' % (100 * acc))

    # Display name: the text before the first '(' of str(clf), cut off at
    # 'Classifier' if that occurs, with 'SVC' shown as 'SVM'.
    name = str(clf).partition('(')[0]
    name = name.split('Classifier', 1)[0]
    if name == 'SVC':
        name = 'SVM'
    return t_train, t_test, 1 - acc, name
if __name__ == "__main__":
    # Script: load four 20-newsgroups categories, vectorise them with TF-IDF,
    # run test_clf() on several classifiers and plot error rate and timings.

    print(u'开始下载/加载数据...')
    t_start = time()
    remove = ()
    categories = 'alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space'
    data_train = fetch_20newsgroups(subset='train', categories=categories,
                                    shuffle=True, random_state=0, remove=remove)
    data_test = fetch_20newsgroups(subset='test', categories=categories,
                                   shuffle=True, random_state=0, remove=remove)
    t_end = time()
    print(u'下载/加载数据完成,耗时%.3f秒' % (t_end - t_start))
    print(u'数据类型:', type(data_train))
    print(u'训练集包含的文本数目:', len(data_train.data))
    print(u'测试集包含的文本数目:', len(data_test.data))
    print(u'训练集和测试集使用的%d个类别的名称:' % len(categories))
    # From here on use the category names as reported by the loaded data set,
    # since y_train indexes into that list.
    categories = data_train.target_names
    pprint(categories)
    y_train = data_train.target
    y_test = data_test.target

    # Show the first ten training documents with their category.
    print(u' -- 前10个文本 -- ')
    for i in range(10):
        print(u'文本%d(属于类别 - %s):' % (i + 1, categories[y_train[i]]))
        print(data_train.data[i])
        print('\n\n')

    # Fit the vocabulary / IDF weights on the training text only and reuse
    # them for the test text.
    vectorizer = TfidfVectorizer(input='content', stop_words='english',
                                 max_df=0.5, sublinear_tf=True)
    x_train = vectorizer.fit_transform(data_train.data)
    x_test = vectorizer.transform(data_test.data)
    print(u'训练集样本个数:%d,特征个数:%d' % x_train.shape)
    print(u'停止词:\n')
    pprint(vectorizer.get_stop_words())
    # Newer scikit-learn releases provide get_feature_names_out() in place of
    # get_feature_names(); use whichever this installation offers.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = np.asarray(vectorizer.get_feature_names_out())
    else:
        feature_names = np.asarray(vectorizer.get_feature_names())

    print(u'\n\n===================\n分类器的比较:\n')
    clfs = (MultinomialNB(),
            BernoulliNB(),
            KNeighborsClassifier(),
            RidgeClassifier(),
            RandomForestClassifier(n_estimators=200),
            SVC()
            )
    result = []
    for clf in clfs:
        a = test_clf(clf)
        result.append(a)
        print('\n')

    # Unzip the (t_train, t_test, error, name) tuples column-wise.  Going
    # through a single mixed NumPy array would turn the numbers into strings,
    # and the removed ``np.float`` alias is avoided by using builtin float.
    time_train, time_test, err, names = zip(*result)
    time_train = np.array(time_train, dtype=float)
    time_test = np.array(time_test, dtype=float)
    err = np.array(err, dtype=float)

    x = np.arange(len(time_train))
    # Font setting so the Chinese labels render; keep the ASCII minus sign.
    mpl.rcParams['font.sans-serif'] = [u'simHei']
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(10, 7), facecolor='w')
    ax = plt.axes()
    b1 = ax.bar(x, err, width=0.25, color='#77E0A0')
    # Timings go on a second y-axis because their scale differs from the
    # error rate's.
    ax_t = ax.twinx()
    b2 = ax_t.bar(x + 0.25, time_train, width=0.25, color='#FFA0A0')
    b3 = ax_t.bar(x + 0.5, time_test, width=0.25, color='#FF8080')
    plt.xticks(x + 0.5, names)
    plt.legend([b1[0], b2[0], b3[0]], (u'错误率', u'训练时间', u'测试时间'),
               loc='upper left', shadow=True)
    plt.title(u'新闻组文本数据不同分类器间的比较', fontsize=18)
    plt.xlabel(u'分类器名称')
    plt.grid(True)
    # ``pad`` must be passed by keyword on current matplotlib versions.
    plt.tight_layout(pad=2)
    plt.show()
分类器的比较:
分类器: MultinomialNB( alpha= 1.0 , class_prior= None , fit_prior= True )
5 折交叉验证的训练时间为:0.540 秒/ ( 5 * 10 ) = 0.011 秒
最优超参数为: { 'alpha' : 0.003593813663804626 }
测试时间:0.004 秒
测试集准确率:89.58 %
分类器: BernoulliNB( alpha= 1.0 , binarize= 0.0 , class_prior= None , fit_prior= True )
5 折交叉验证的训练时间为:0.862 秒/ ( 5 * 10 ) = 0.017 秒
最优超参数为: { 'alpha' : 0.001 }
测试时间:0.009 秒
测试集准确率:88.54 %
分类器: KNeighborsClassifier( algorithm= 'auto' , leaf_size= 30 , metric= 'minkowski' ,
metric_params= None , n_jobs= None , n_neighbors= 5 , p= 2 ,
weights= 'uniform' )
5 折交叉验证的训练时间为:4.213 秒/ ( 5 * 14 ) = 0.060 秒
最优超参数为: { 'n_neighbors' : 3 }
测试时间:0.193 秒
测试集准确率:86.03 %
分类器: RidgeClassifier( alpha= 1.0 , class_weight= None , copy_X= True , fit_intercept= True ,
max_iter= None , normalize= False , random_state= None ,
solver= 'auto' , tol= 0.001 )
5 折交叉验证的训练时间为:7.214 秒/ ( 5 * 10 ) = 0.144 秒
最优超参数为: { 'alpha' : 0.001 }
测试时间:0.002 秒
测试集准确率:89.28 %
分类器: RandomForestClassifier( bootstrap= True , ccp_alpha= 0.0 , class_weight= None ,
criterion= 'gini' , max_depth= None , max_features= 'auto' ,
max_leaf_nodes= None , max_samples= None ,
min_impurity_decrease= 0.0 , min_impurity_split= None ,
min_samples_leaf= 1 , min_samples_split= 2 ,
min_weight_fraction_leaf= 0.0 , n_estimators= 200 ,
n_jobs= None , oob_score= False , random_state= None ,
verbose= 0 , warm_start= False )
5 折交叉验证的训练时间为:27.850 秒/ ( 5 * 6 ) = 0.928 秒
最优超参数为: { 'max_depth' : 9 }
测试时间:0.147 秒
测试集准确率:77.16 %
分类器: SVC( C= 1.0 , break_ties= False , cache_size= 200 , class_weight= None , coef0= 0.0 ,
decision_function_shape= 'ovr' , degree= 3 , gamma= 'scale' , kernel= 'rbf' ,
max_iter= - 1 , probability= False , random_state= None , shrinking= True ,
tol= 0.001 , verbose= False )
5 折交叉验证的训练时间为:170.046 秒/ ( 5 * 9 ) = 3.779 秒
最优超参数为: { 'C' : 100.0 , 'gamma' : 0.03162277660168379 }
测试时间:1.795 秒
测试集准确率:90.10 %