朴素贝叶斯
(1)高斯朴素贝叶斯
import pandas as pd
# Load the skin-segmentation workbook into a DataFrame.
skin = pd.read_excel(r'D:\\python 2019暑假\\朴素贝叶斯模型\\Skin_Segment.xlsx')

# Recode the label column: original code 2 becomes 0, original code 1 stays 1.
skin['y'] = skin['y'].map({2: 0, 1: 1})

# Class balance after recoding (displayed when run as a notebook cell).
skin['y'].value_counts()
0 194198
1 50859
Name: y, dtype: int64
from sklearn import model_selection
from sklearn import naive_bayes

# Hold out 25% of the rows for testing; the first three columns are the
# predictors and `y` is the target. The seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    skin.iloc[:, :3],
    skin.y,
    test_size=0.25,
    random_state=1234,
)

# Fit a Gaussian naive Bayes classifier and predict the held-out rows.
gnb = naive_bayes.GaussianNB()
gnb.fit(X_train, y_train)
gnb_pred = gnb.predict(X_test)

# Distribution of predicted classes (displayed when run as a notebook cell).
pd.Series(gnb_pred).value_counts()
0 50630
1 10635
dtype: int64
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix: rows are the predicted labels, columns are the true labels.
cm = pd.crosstab(gnb_pred, y_test)

# Draw the matrix as an annotated heat map with integer cell labels.
sns.heatmap(cm, annot=True, cmap='GnBu', fmt='d')
plt.xlabel('Real')
plt.ylabel('Predict')
plt.show()

# Overall accuracy followed by the per-class precision/recall/F1 report.
accuracy = metrics.accuracy_score(y_test, gnb_pred)
report = metrics.classification_report(y_test, gnb_pred)
print('模型的准确率:\n', accuracy)
print('模型的评估报告:\n', report)
模型的准确率:
0.9229576430261976
模型的评估报告:
precision recall f1-score support
0 0.93 0.97 0.95 48522
1 0.88 0.73 0.80 12743
avg / total 0.92 0.92 0.92 61265
# Scores for the ROC curve: the second column of predict_proba.
y_score = gnb.predict_proba(X_test)[:, 1]

# False/true positive rates at every threshold, and the area under the curve.
fpr, tpr, threshold = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr, tpr)

# Shaded area under the ROC curve, with the curve outlined on top.
plt.stackplot(fpr, tpr, color='steelblue', alpha=0.5, edgecolor='black')
plt.plot(fpr, tpr, color='black', lw=1)

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='red', linestyle='--')

# Annotate the plot with the AUC value.
auc_label = 'ROC curve(area = %0.2f)' % roc_auc
plt.text(0.5, 0.3, auc_label)

plt.xlabel('1-Specificity')
plt.ylabel('Sensitivity')
plt.show()
(2)多项式朴素贝叶斯
# Open the CSV explicitly and hand the file object to pandas. The context
# manager closes the handle as soon as the frame has been loaded, instead of
# leaving it open for the rest of the session.
with open('D:\\python 2019暑假\\朴素贝叶斯模型\\mushrooms.csv') as a:
    mushrooms = pd.read_csv(a)

# Preview the first rows (displayed when run as a notebook cell).
mushrooms.head()
type cap_shape cap_surface cap_color bruises odor gill_attachment gill_spacing gill_size gill_color ... stalk_surface_above_ring stalk_surface_below_ring stalk_color_above_ring stalk_color_below_ring veil_color ring_number ring_type spore_print_color population habitat 0 poisonous convex smooth brown yes pungent free close narrow black ... smooth smooth white white white one pendant black scattered urban 1 edible convex smooth yellow yes almond free close broad black ... smooth smooth white white white one pendant brown numerous grasses 2 edible bell smooth white yes anise free close broad brown ... smooth smooth white white white one pendant brown numerous meadows 3 poisonous convex scaly white yes pungent free close narrow brown ... smooth smooth white white white one pendant black scattered urban 4 edible convex smooth gray no none free crowded broad black ... smooth smooth white white white one evanescent brown abundant grasses
5 rows × 22 columns
# Every column after the first is replaced by integer category codes;
# the first column is left as-is.
columns = mushrooms.columns[1:]
for column in columns:
    # pd.factorize returns (codes, uniques); only the codes are kept.
    codes, _ = pd.factorize(mushrooms[column])
    mushrooms[column] = codes

# Preview the encoded frame (displayed when run as a notebook cell).
mushrooms.head()
type cap_shape cap_surface cap_color bruises odor gill_attachment gill_spacing gill_size gill_color ... stalk_surface_above_ring stalk_surface_below_ring stalk_color_above_ring stalk_color_below_ring veil_color ring_number ring_type spore_print_color population habitat 0 poisonous 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 1 edible 0 0 1 0 1 0 0 1 0 ... 0 0 0 0 0 0 0 1 1 1 2 edible 1 0 2 0 2 0 0 1 1 ... 0 0 0 0 0 0 0 1 1 2 3 poisonous 0 1 2 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0 4 edible 0 0 3 1 3 0 1 1 0 ... 0 0 0 0 0 0 1 1 2 1
5 rows × 22 columns
# All columns after the first are predictors; `type` is the target.
Predictors = mushrooms.columns[1:]

# Hold out 25% of the rows for testing, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    mushrooms[Predictors],
    mushrooms['type'],
    test_size=0.25,
    random_state=10,
)

from sklearn import naive_bayes

# Fit a multinomial naive Bayes classifier and predict the held-out rows.
mnb = naive_bayes.MultinomialNB()
mnb.fit(X_train, y_train)
mnb_pred = mnb.predict(X_test)

# Confusion matrix: rows are the predicted labels, columns are the true labels.
cm = pd.crosstab(mnb_pred, y_test)
sns.heatmap(cm, annot=True, cmap='GnBu', fmt='d')
# Label the axes the same way as the other confusion-matrix plots in this
# file, so the predicted and real axes can be told apart.
plt.xlabel('Real')
plt.ylabel('Predict')
plt.show()

# Overall accuracy followed by the per-class precision/recall/F1 report.
print('模型的准确率:\n', metrics.accuracy_score(y_test, mnb_pred))
print('模型的评估报告:\n', metrics.classification_report(y_test, mnb_pred))
模型的准确率:
0.8700147710487445
模型的评估报告:
precision recall f1-score support
edible 0.85 0.92 0.88 1072
poisonous 0.90 0.82 0.86 959
avg / total 0.87 0.87 0.87 2031
# Scores for the ROC curve: the second column of predict_proba.
# NOTE(review): presumably this is the 'poisonous' class, matching the 0/1
# mapping below — confirm against mnb.classes_.
y_score = mnb.predict_proba(X_test)[:, 1]

# roc_curve is given numeric labels here, so map the string classes to 0/1.
fpr, tpr, threshold = metrics.roc_curve(
    y_test.map({'edible': 0, 'poisonous': 1}),
    y_score,
)
roc_auc = metrics.auc(fpr, tpr)

# Shaded area under the ROC curve, with the curve outlined on top.
plt.stackplot(fpr, tpr, color='steelblue', alpha=0.5, edgecolor='black')
plt.plot(fpr, tpr, color='black', lw=1)

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='red', linestyle='--')

# Annotate the plot with the AUC value.
plt.text(0.5, 0.3, 'ROC curve(area = %0.2f)' % roc_auc)

# Axis label typo fixed: was '1-Specigicity'.
plt.xlabel('1-Specificity')
plt.ylabel('Sensitivity')
plt.show()
(3)伯努利朴素贝叶斯
# Load the first sheet of the comments workbook. The keyword is `sheet_name`;
# the old `sheetname` spelling is no longer accepted by pandas.read_excel.
evaluation = pd.read_excel(
    r'D:\\python 2019暑假\\朴素贝叶斯模型\\Contents.xlsx',
    sheet_name=0,
)

# Preview the first ten rows (displayed when run as a notebook cell).
evaluation.head(10)
NickName Date Content Type 0 AdJQKzNaZWAV 2016-04-14 23:30:42 想知道是不是卖家给我发错货了,怎么四个连接铁通的仅一个能连上,**块钱的东西说便宜也不至于廉... Negative 1 sdmjk 2013-06-24 22:37:51 垃圾。\n两个管两头一样粗,得自己用钳子摄细才能装上\n管子很软很细\n总的评价 - 就是两... Negative 2 f***n 2015-06-05 21:10:31 我就无语了...难弄到死..又没说明书..过段差评.. Negative 3 jd_817039867 2014-04-13 22:43:38 不满意,明明写的落地!结果差一截!而且自垂度不怎么好~还要用夹子夹!没有我在附近小超市买的质... Negative 4 jd_wscj529 2014-06-09 13:06:17 标的次日到达,结果快递用了四天,蚊帐杆底座太小,管壁太薄,而且蚊帐也没宣传那么垂地,此次购物... Negative 5 q***r 2017-04-25 00:24:25 真的很好,比超市卖的便宜多了,刚回来打算在买个给我嫂子家,发货也很快没想到这么便宜也能买的这... Positive 6 A***波 2017-05-11 06:53:47 最不满意的一次网购。直接看图。1.8的床两头1.8中间只有1.5了,两边的纱像少了一截,绷得... Negative 7 巴***住 2017-05-15 11:05:01 很不错的,就是拉丝不够好啊 Positive 8 j***w 2016-05-15 09:25:49 薄,漂亮 Positive 9 1***n 2017-05-15 09:00:43 味道有点大,线头有点多。装好后有点摇晃,拉链处开了几处,要不是等着急用,真心是要退 Positive
# Strip ASCII digits and letters from each comment. The pattern is a regular
# expression, so regex=True must be explicit: recent pandas versions treat the
# pattern as a literal string by default, which would leave the text unchanged.
evaluation['Content'] = evaluation['Content'].str.replace('[0-9a-zA-Z]', '', regex=True)

# Preview the cleaned frame (displayed when run as a notebook cell).
evaluation.head()
NickName Date Content Type 0 AdJQKzNaZWAV 2016-04-14 23:30:42 想知道是不是卖家给我发错货了,怎么四个连接铁通的仅一个能连上,**块钱的东西说便宜也不至于廉... Negative 1 sdmjk 2013-06-24 22:37:51 垃圾。\n两个管两头一样粗,得自己用钳子摄细才能装上\n管子很软很细\n总的评价 - 就是两... Negative 2 f***n 2015-06-05 21:10:31 我就无语了...难弄到死..又没说明书..过段差评.. Negative 3 jd_817039867 2014-04-13 22:43:38 不满意,明明写的落地!结果差一截!而且自垂度不怎么好~还要用夹子夹!没有我在附近小超市买的质... Negative 4 jd_wscj529 2014-06-09 13:06:17 标的次日到达,结果快递用了四天,蚊帐杆底座太小,管壁太薄,而且蚊帐也没宣传那么垂地,此次购物... Negative
import jieba

# Register the custom word list with jieba before any segmentation happens.
jieba.load_userdict(r'D:\\python 2019暑假\\朴素贝叶斯模型\\all_words.txt')

# Read the stop words, one per line, with surrounding whitespace removed.
with open(r'D:\\python 2019暑假\\朴素贝叶斯模型\\mystopwords.txt', encoding='UTF-8') as words:
    stop_words = [line.strip() for line in words]

# Set copy used for membership tests: cut_word checks every token of every
# comment, and a set lookup is O(1) where scanning the list is O(len(list)).
_stop_word_set = frozenset(stop_words)


def cut_word(sentence):
    """Segment *sentence* with jieba and drop stop words.

    Args:
        sentence: The text to segment.

    Returns:
        The remaining tokens joined by single spaces.
    """
    kept = [token for token in jieba.lcut(sentence) if token not in _stop_word_set]
    return ' '.join(kept)


# Tokenise every comment; `words` now holds one space-separated string per row.
words = evaluation.Content.apply(cut_word)

# Preview the first five tokenised comments (displayed in a notebook cell).
words[:5]
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\袁博\AppData\Local\Temp\jieba.cache
Loading model cost 0.891 seconds.
Prefix dict has been built succesfully.
0 想 卖家 给我发 错货 四个 连接 铁通 块钱 便宜 廉价 退货
1 垃圾 \n 钳子 摄细 装 \n 管子 很软 \n 评价 垃圾
2 我就 无语 难弄 .. 说明书 .. 过段 差评 ..
3 不满意 写 落地 差一截 垂度 ~ 夹子 夹 没有 超市 买 质量好 换季 卖得 价钱 便宜
4 标的 到达 快递 四天 蚊帐 底座 太小 管壁 太薄 蚊帐 也没 宣传 垂地 购物 失败
Name: Content, dtype: object
from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term count matrix. A float min_df is a document-frequency
# proportion: terms occurring in fewer than 1% of the documents are dropped.
counts = CountVectorizer(min_df=0.01)
dtm_counts = counts.fit_transform(words).toarray()

# Vocabulary terms, used as column names. get_feature_names() was removed in
# newer scikit-learn releases in favour of get_feature_names_out(); fall back
# to the old method so older installations keep working.
if hasattr(counts, 'get_feature_names_out'):
    columns = counts.get_feature_names_out()
else:
    columns = counts.get_feature_names()

# Predictors: one column per retained term. Target: the `Type` column.
X = pd.DataFrame(dtm_counts, columns=columns)
y = evaluation.Type

# Preview the predictor matrix (displayed when run as a notebook cell).
X.head()
一根 下单 不值 不好 不想 不满意 不知道 不行 不错 买回来 ... 还好 还行 退货 送货 速度 钢管 防蚊 非常好 颜色 麻烦 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
5 rows × 99 columns
from sklearn import model_selection
from sklearn import naive_bayes

# Hold out 25% of the rows for testing, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

# Fit a Bernoulli naive Bayes classifier and predict the held-out rows.
bnb = naive_bayes.BernoulliNB()
bnb.fit(X_train, y_train)
bnb_pred = bnb.predict(X_test)

# Confusion matrix: rows are the predicted labels, columns are the true labels.
cm = pd.crosstab(bnb_pred, y_test)

# Draw the matrix as an annotated heat map with integer cell labels.
sns.heatmap(cm, annot=True, cmap='GnBu', fmt='d')
plt.xlabel('Real')
plt.ylabel('Predict')
plt.show()
# Scores for the ROC curve: the second column of predict_proba.
y_score = bnb.predict_proba(X_test)[:, 1]

# roc_curve is given numeric labels here, so map the string classes to 0/1.
numeric_truth = y_test.map({'Negative': 0, 'Positive': 1})
fpr, tpr, threshold = metrics.roc_curve(numeric_truth, y_score)
roc_auc = metrics.auc(fpr, tpr)

# Shaded area under the ROC curve, with the curve outlined on top.
plt.stackplot(fpr, tpr, color='steelblue', alpha=0.5, edgecolor='black')
plt.plot(fpr, tpr, color='black', lw=1)

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='red', linestyle='--')

# Annotate the plot with the AUC value.
auc_label = 'ROC curve (area = %0.2f)' % roc_auc
plt.text(0.5, 0.3, auc_label)

plt.xlabel('1-Specificity')
plt.ylabel('Sensitivity')
plt.show()