import pandas as pd

# Load the skin-segmentation data from the Excel workbook.
skin = pd.read_excel(r'Skin_Segment.xlsx')

# Recode the target column: the original label 2 becomes 0, label 1 stays 1.
skin['y'] = skin['y'].map({2: 0, 1: 1})

# Class frequencies after recoding; the result is neither stored nor printed,
# so it is only visible when run interactively.
skin['y'].value_counts()
from sklearn import model_selection
from sklearn import naive_bayes

# Split into train/test sets: the first three columns are the predictors,
# `y` is the target, 25% of the rows are held out, and the seed is fixed.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    skin.iloc[:, :3],
    skin.y,
    test_size=0.25,
    random_state=1234,
)

# Fit a Gaussian naive Bayes classifier on the training part and predict
# the labels of the held-out part.
gnb = naive_bayes.GaussianNB().fit(X_train, y_train)
gnb_pred = gnb.predict(X_test)

# Frequency of each predicted label (result is not stored or printed).
pd.Series(gnb_pred).value_counts()
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix: rows are the predicted labels, columns the true labels.
cm = pd.crosstab(index=gnb_pred, columns=y_test)

# Draw the matrix as an annotated heat map with integer cell labels.
sns.heatmap(cm, annot=True, cmap='GnBu', fmt='d')
plt.xlabel('Real')
plt.ylabel('Predict')
plt.show()

# Overall accuracy followed by the per-class report on the test set.
print('模型的准确率为:\n', metrics.accuracy_score(y_test, gnb_pred))
print('模型的评估报告:\n', metrics.classification_report(y_test, gnb_pred))
# Score each test row with column 1 of the predicted probabilities
# (the second class listed in gnb.classes_).
y_score = gnb.predict_proba(X_test)[:, 1]

# ROC points and the area under the curve.
fpr, tpr, threshold = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr, tpr)

# Shaded area under the curve, then the curve outline on top of it.
plt.stackplot(
    fpr, tpr,
    color='steelblue',
    alpha=0.5,
    edgecolor='black',
)
plt.plot(fpr, tpr, color='black', lw=1)

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], color='red', linestyle='--')

# Print the AUC value inside the plot area.
plt.text(0.5, 0.3, 'ROC curve (area = %0.2f)' % roc_auc)
plt.xlabel('1-Specificity')
plt.ylabel('Sensitivity')
plt.show()
import pandas as pd

# Load the mushroom data set.
mushrooms = pd.read_csv(r'mushrooms.csv')
mushrooms.head()  # preview; result is not stored or printed

# Integer-encode every column except the first one. pd.factorize returns
# (codes, uniques); only the integer codes are written back.
# NOTE(review): the first column is presumably the `type` label - confirm.
columns = mushrooms.columns[1:]
for column in columns:
    codes, _uniques = pd.factorize(mushrooms[column])
    mushrooms[column] = codes

mushrooms.head()  # preview of the encoded frame
from sklearn import model_selection
from sklearn import naive_bayes
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt

# Every column after the first is a predictor; `type` is the target.
Predictors = mushrooms.columns[1:]

# Hold out 25% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    mushrooms[Predictors],
    mushrooms['type'],
    test_size=0.25,
    random_state=10,
)

# Fit a multinomial naive Bayes classifier and predict the held-out rows.
mnb = naive_bayes.MultinomialNB().fit(X_train, y_train)
mnb_pred = mnb.predict(X_test)
# Confusion matrix: rows are the predicted labels, columns the true labels.
cm = pd.crosstab(index=mnb_pred, columns=y_test)

# Draw the matrix as an annotated heat map with integer cell labels.
sns.heatmap(cm, annot=True, cmap='GnBu', fmt='d')
plt.xlabel('Real')
plt.ylabel('Predict')
plt.show()

# Overall accuracy followed by the per-class report on the test set.
print('模型的准确率为:\n', metrics.accuracy_score(y_test, mnb_pred))
print('模型的评估报告:\n', metrics.classification_report(y_test, mnb_pred))
from sklearn import metrics

# Score each test row with column 1 of the predicted probabilities
# (the second class listed in mnb.classes_).
y_score = mnb.predict_proba(X_test)[:, 1]

# The true labels are strings, so they are mapped to 0/1 before being
# passed to roc_curve ('poisonous' is treated as the positive class).
fpr, tpr, threshold = metrics.roc_curve(
    y_test.map({'edible': 0, 'poisonous': 1}),
    y_score,
)
roc_auc = metrics.auc(fpr, tpr)

# Shaded area under the curve, then the curve outline on top of it.
plt.stackplot(
    fpr, tpr,
    color='steelblue',
    alpha=0.5,
    edgecolor='black',
)
plt.plot(fpr, tpr, color='black', lw=1)

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], color='red', linestyle='--')

# Print the AUC value inside the plot area.
plt.text(0.5, 0.3, 'ROC curve (area = %0.2f)' % roc_auc)
plt.xlabel('1-Specificity')
plt.ylabel('Sensitivity')
plt.show()
import pandas as pd

# Load the first worksheet of the review workbook.
# The keyword is spelled `sheet_name`: the old `sheetname` spelling was
# deprecated in pandas 0.21 and later removed, so it now raises TypeError.
evaluation = pd.read_excel(r'Contents.xlsx', sheet_name=0)
evaluation.head(10)  # preview; result is not stored or printed

# Strip every ASCII digit and letter from the review text.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as
# a literal string by default, which would leave the text unchanged.
evaluation.Content = evaluation.Content.str.replace('[0-9a-zA-Z]', '', regex=True)
evaluation.head()  # preview of the cleaned frame
import jieba

# Load the user-defined dictionary file into jieba before any segmentation.
jieba.load_userdict(r'all_words.txt')

# Read the stop-word file, one entry per line with surrounding whitespace
# removed. The entries are kept in a set because cut_word() tests every token
# of every sentence for membership; a set makes each test O(1) instead of a
# linear scan over the whole list.
with open(r'mystopwords.txt', encoding='UTF-8') as words:
    stop_words = {line.strip() for line in words}
def cut_word(sentence):
    """Segment *sentence* with jieba and return the kept tokens as one string.

    Tokens found in the module-level ``stop_words`` collection are dropped;
    the remaining tokens are joined with single spaces, in their original
    order.
    """
    return ' '.join(
        token for token in jieba.lcut(sentence) if token not in stop_words
    )
# Segment every review; each row becomes a space-separated token string.
words = evaluation.Content.apply(cut_word)
words[:5]  # preview; result is not stored or printed

from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term count matrix. A float min_df is a document
# proportion, so terms present in fewer than 1% of the documents are dropped.
counts = CountVectorizer(min_df=0.01)
dtm_counts = counts.fit_transform(words).toarray()

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(counts, 'get_feature_names_out'):
    columns = counts.get_feature_names_out()
else:
    columns = counts.get_feature_names()

# Predictors: one column per retained term. Target: the review type.
X = pd.DataFrame(dtm_counts, columns=columns)
y = evaluation.Type
X.head()  # preview of the predictor frame
from sklearn import model_selection
from sklearn import naive_bayes
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

# Hold out 25% of the documents for testing, with a fixed seed.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

# Fit a Bernoulli naive Bayes classifier and predict the held-out rows.
bnb = naive_bayes.BernoulliNB().fit(X_train, y_train)
bnb_pred = bnb.predict(X_test)
# Confusion matrix: rows are the predicted labels, columns the true labels.
cm = pd.crosstab(index=bnb_pred, columns=y_test)

# Draw the matrix as an annotated heat map with integer cell labels.
sns.heatmap(cm, annot=True, cmap='GnBu', fmt='d')
plt.xlabel('Real')
plt.ylabel('Predict')
plt.show()

# Overall accuracy followed by the per-class report on the test set.
print('模型的准确率为:\n', metrics.accuracy_score(y_test, bnb_pred))
print('模型的评估报告:\n', metrics.classification_report(y_test, bnb_pred))
# Score each test row with column 1 of the predicted probabilities
# (the second class listed in bnb.classes_).
y_score = bnb.predict_proba(X_test)[:, 1]

# The true labels are strings, so they are mapped to 0/1 before being
# passed to roc_curve ('Positive' is treated as the positive class).
fpr, tpr, threshold = metrics.roc_curve(
    y_test.map({'Negative': 0, 'Positive': 1}),
    y_score,
)
roc_auc = metrics.auc(fpr, tpr)

# Shaded area under the curve, then the curve outline on top of it.
plt.stackplot(
    fpr, tpr,
    color='steelblue',
    alpha=0.5,
    edgecolor='black',
)
plt.plot(fpr, tpr, color='black', lw=1)

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], color='red', linestyle='--')

# Print the AUC value inside the plot area.
plt.text(0.5, 0.3, 'ROC curve (area = %0.2f)' % roc_auc)
plt.xlabel('1-Specificity')
plt.ylabel('Sensitivity')
plt.show()