机器学习实验:使用3种朴素贝叶斯算法对iris()、breast_cancer()和wine()数据集进行训练
实验内容:
分别使用3种朴素贝叶斯算法对iris()、breast_cancer()和wine()数据集进行训练,并通过比较其分类效果分析不同算法对不同数据集的适用情况。
实现代码:
-
导入相关库
import pandas as pd import numpy as np import sklearn.metrics as metrics from sklearn.preprocessing import label_binarize from sklearn.datasets import load_iris from sklearn.datasets import load_wine from sklearn.datasets import load_breast_cancer from sklearn.model_selection import train_test_split import sklearn.naive_bayes as bayes
-
导入数据集
# Load the three scikit-learn benchmark datasets used in the experiments:
# the iris dataset, the breast cancer dataset and the wine dataset.
iris, cancer, wine = load_iris(), load_breast_cancer(), load_wine()
-
对iris数据集进行训练
# Iris experiment: train three naive Bayes variants and compare test accuracy.

# 1. Prepare the data: features/labels, then a 70/30 train/test split with a
#    fixed random seed.
X_iris, Y_iris = iris.data, iris.target
train_x_iris, test_x_iris, train_y_iris, test_y_iris = train_test_split(
    X_iris,
    Y_iris,
    test_size=0.3,
    random_state=2,
)
# Recorded shapes after the split: train (105, 4), test (45, 4).

# 2. Create the Gaussian, Bernoulli and multinomial naive Bayes classifiers
#    and train each one on the training split (fit() returns the estimator).
# NOTE: MultinomialNB cannot be used when the input contains negative values.
gaussian = bayes.GaussianNB().fit(train_x_iris, train_y_iris)
bernoulli = bayes.BernoulliNB().fit(train_x_iris, train_y_iris)
multionmial = bayes.MultinomialNB().fit(train_x_iris, train_y_iris)

# 3. Predict the labels of the held-out test split.
test_y_gaussianHat_iris = gaussian.predict(test_x_iris)
test_y_bernoulliHat_iris = bernoulli.predict(test_x_iris)
test_y_multionmialHat_iris = multionmial.predict(test_x_iris)

# 4. Report the accuracy of every classifier on the test split.
score_gaussian = gaussian.score(test_x_iris, test_y_iris)
score_bernoulli = bernoulli.score(test_x_iris, test_y_iris)
score_multionmial = multionmial.score(test_x_iris, test_y_iris)
print("gaussian score:", score_gaussian, sep="")
print("bernoulli score:", score_bernoulli, sep="")
print("multionmial score:", score_multionmial, sep="")
# Recorded output:
#   gaussian score:0.9777777777777777
#   bernoulli score:0.28888888888888886
#   multionmial score:0.9111111111111111
-
对breast_cancer数据集训练
# Breast cancer experiment: train three naive Bayes variants and compare
# test accuracy.

# 1. Prepare the data: features/labels, then a 70/30 train/test split with a
#    fixed random seed.
X_cancer, Y_cancer = cancer.data, cancer.target
train_x_cancer, test_x_cancer, train_y_cancer, test_y_cancer = train_test_split(
    X_cancer,
    Y_cancer,
    test_size=0.3,
    random_state=2,
)
# Recorded shapes after the split: train (398, 30), test (171, 30).

# 2. Create the Gaussian, Bernoulli and multinomial naive Bayes classifiers
#    and train each one on the training split (fit() returns the estimator).
# NOTE: MultinomialNB cannot be used when the input contains negative values.
gaussian = bayes.GaussianNB().fit(train_x_cancer, train_y_cancer)
bernoulli = bayes.BernoulliNB().fit(train_x_cancer, train_y_cancer)
multionmial = bayes.MultinomialNB().fit(train_x_cancer, train_y_cancer)

# 3. Predict the labels of the held-out test split.
test_y_gaussianHat_cancer = gaussian.predict(test_x_cancer)
test_y_bernoulliHat_cancer = bernoulli.predict(test_x_cancer)
test_y_multionmialHat_cancer = multionmial.predict(test_x_cancer)

# 4. Report the accuracy of every classifier on the test split.
score_gaussian = gaussian.score(test_x_cancer, test_y_cancer)
score_bernoulli = bernoulli.score(test_x_cancer, test_y_cancer)
score_multionmial = multionmial.score(test_x_cancer, test_y_cancer)
print("gaussian score:", score_gaussian, sep="")
print("bernoulli score:", score_bernoulli, sep="")
print("multionmial score:", score_multionmial, sep="")
# Recorded output:
#   gaussian score:0.9415204678362573
#   bernoulli score:0.6081871345029239
#   multionmial score:0.8888888888888888
-
对wine数据集训练
# Wine experiment: train three naive Bayes variants and compare test accuracy.

# 1. Prepare the data: features/labels, then a 70/30 train/test split with a
#    fixed random seed.
X_wine, Y_wine = wine.data, wine.target
train_x_wine, test_x_wine, train_y_wine, test_y_wine = train_test_split(
    X_wine,
    Y_wine,
    test_size=0.3,
    random_state=2,
)
# Recorded shapes after the split: train (124, 13), test (54, 13).

# 2. Create the Gaussian, Bernoulli and multinomial naive Bayes classifiers
#    and train each one on the training split (fit() returns the estimator).
# NOTE: MultinomialNB cannot be used when the input contains negative values.
gaussian = bayes.GaussianNB().fit(train_x_wine, train_y_wine)
bernoulli = bayes.BernoulliNB().fit(train_x_wine, train_y_wine)
multionmial = bayes.MultinomialNB().fit(train_x_wine, train_y_wine)

# 3. Predict the labels of the held-out test split.
test_y_gaussianHat_wine = gaussian.predict(test_x_wine)
test_y_bernoulliHat_wine = bernoulli.predict(test_x_wine)
test_y_multionmialHat_wine = multionmial.predict(test_x_wine)

# 4. Report the accuracy of every classifier on the test split.
score_gaussian = gaussian.score(test_x_wine, test_y_wine)
score_bernoulli = bernoulli.score(test_x_wine, test_y_wine)
score_multionmial = multionmial.score(test_x_wine, test_y_wine)
print("gaussian score:", score_gaussian, sep="")
print("bernoulli score:", score_bernoulli, sep="")
print("multionmial score:", score_multionmial, sep="")
# Recorded output:
#   gaussian score:0.9629629629629629
#   bernoulli score:0.37037037037037035
#   multionmial score:0.8888888888888888
总结: sklearn提供多个朴素贝叶斯分类器,它们的主要区别在于假设了不同的概率分布。这三种常用的朴素贝叶斯分类器的区别如下:
- 高斯朴素贝叶斯分类器(GaussianNB):适合于连续值
- 多项式贝叶斯分类器(MultinomialNB):适合于离散值(如:计数)
- 伯努利贝叶斯分类器(BernoulliNB):适合于二值
不同朴素贝叶斯模型的区别,主要在于它们对概率分布 $P(X_i \mid C)$ 所做的不同假设。