对酒瓶颜色（wine_data.xlsx）按照三原色的三个特征值进行分类，数据类别分为A、B、C、D四类。采用Python实现基于正态分布的最小错误率贝叶斯分类器算法，输出判别结果和正确率。

本文链接：https://blog.csdn.net/qq_60943902/article/details/128484191

写在前面：

对于此题，下面提供了三种写法，都是基于最小错误率贝叶斯决策实现的，并且对wine_data的预测准确率都是100%。代码注释已经写好，需要数据集的读者可以通过下面的链接获取：

wine_data数据集链接
提取码：ciba

方法一：通过sklearn库中GaussianNB实现

运行结果如下：

图表 1判别结果

图表 2正确率

运行代码如下：

from matplotlib import pyplot as plt

from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split

from sklearn import metrics

import pandas as pd

# Load the data set.
data_all = pd.read_excel('wine_data.xlsx')

# Columns 1-3 are used as the features and column 4 as the class label.
data = data_all.iloc[:, 1:4]
target = data_all.iloc[:, 4]

# Number of leading rows used for training; the remaining rows are the test set.
# It is passed to train_test_split as an integer row count: a float fraction
# such as 29/len(data) is floored after multiplication and can lose a row to
# rounding, which would misalign the write-back below.
train_size = 29

x_train, x_test, y_train, y_test = train_test_split(
    data, target, train_size=train_size, shuffle=False)

# Fit the Gaussian naive Bayes model on the training rows.
clf_G2 = GaussianNB().fit(x_train, y_train)

# Predicted class of every test sample, kept as a one-column DataFrame.
y_pred = pd.DataFrame(clf_G2.predict(x_test))

# Per-class predicted probabilities of the test samples, and their logarithms.
y_pred_proba = clf_G2.predict_proba(x_test)
y_pred_log_proba = clf_G2.predict_log_proba(x_test)

print(y_pred)
print(y_test)

# Write the predictions into column 5 of the test rows. A plain array is
# assigned so the write is purely positional and does not depend on how
# pandas treats the index of a DataFrame on the right-hand side.
data_all.iloc[train_size:, 5] = y_pred.iloc[:, 0].to_numpy()

print(y_pred_log_proba[:5])
print("训练集评分：", clf_G2.score(x_train, y_train))
print("测试集评分：", clf_G2.score(x_test, y_test))
# accuracy_score compares the true and predicted test labels.
print("精准率：", metrics.accuracy_score(y_test, y_pred))

data_all.to_excel('result_new1.xlsx', index=None)
print(data_all)

方法二：基于正态分布的最小错误率分类

运行结果：

图表 3判别结果

图表 4正确率

代码如下：

import pandas as pd

import numpy as np

import math

# Load the sample data.
data = pd.read_excel('wine_data.xlsx')

# The first 29 rows form the training set, the remaining rows the test set.
data_tr = data[0:29]
data_te = data[29:]

N = len(data_tr)                          # number of training samples
w = len(set(data.iloc[:, 4]))             # number of distinct class labels
n = 3                                     # number of features per sample

# Class labels of the training rows.
labels_tr = data_tr['所属类别']

# Number of training samples in each of the four classes.
N1 = int((labels_tr == 1).sum())
N2 = int((labels_tr == 2).sum())
N3 = int((labels_tr == 3).sum())
N4 = int((labels_tr == 4).sum())

# Feature columns (1-3) of the training rows of each class. Rows are picked
# by their label rather than by position, so the result does not depend on
# the file being sorted by class.
A = data_tr.loc[labels_tr == 1].iloc[:, 1:4]    # class 1
B = data_tr.loc[labels_tr == 2].iloc[:, 1:4]    # class 2
C = data_tr.loc[labels_tr == 3].iloc[:, 1:4]    # class 3
D = data_tr.loc[labels_tr == 4].iloc[:, 1:4]    # class 4

# Prior probability of each class, estimated from the training set.
pw1 = N1/N
pw2 = N2/N
pw3 = N3/N
pw4 = N4/N

def get_p(A, pw, samples=None):
    """Score samples against one class with the Gaussian Bayes discriminant.

    For every sample x the minimum-error-rate discriminant of a normally
    distributed class is evaluated:

        g(x) = -1/2 (x - mu)^T S^-1 (x - mu) + ln(pw) - 1/2 ln|S|

    where mu and S are the mean vector and covariance matrix estimated from
    ``A``. The term shared by all classes is omitted, so scores are only
    meaningful relative to the scores of other classes: the class with the
    largest score is the decision.

    Args:
        A: DataFrame holding the training feature vectors of one class
            (one row per sample, one column per feature).
        pw: Prior probability of the class.
        samples: Feature vectors to score (DataFrame or 2-D array with the
            same columns as ``A``). When omitted, feature columns 1-3 of
            the module-level ``data_te`` are used.

    Returns:
        A list with one float score per sample, in sample order.

    Raises:
        numpy.linalg.LinAlgError: If the class covariance matrix is singular.
    """
    if samples is None:
        # Default kept for existing callers that rely on the global test set.
        samples = data_te.iloc[:, 1:4]
    x = np.asarray(samples, dtype=float)

    mean = np.asarray(A.mean(), dtype=float)      # class mean vector
    cov = np.asarray(A.cov(), dtype=float)        # class covariance matrix
    cov_inv = np.linalg.inv(cov)
    # slogdet returns the log of the determinant directly, which avoids
    # taking the log of a very small determinant.
    _, log_det = np.linalg.slogdet(cov)
    log_prior = np.log(pw)

    scores = []
    for row in x:
        diff = row - mean
        mahalanobis = diff @ cov_inv @ diff
        scores.append(float(-0.5 * mahalanobis + log_prior - 0.5 * log_det))
    return scores

# Score every test sample against each of the four classes.
P1 = get_p(A, pw1)
P2 = get_p(B, pw2)
P3 = get_p(C, pw3)
P4 = get_p(D, pw4)

# Test rows follow the training rows in `data`, so the i-th test sample
# lives at row len(data_tr) + i.
n_train = len(data_tr)

# Number of correctly classified test samples.
cnt = 0
for i, scores in enumerate(zip(P1, P2, P3, P4)):
    row = n_train + i
    # Reduce each score to a plain float so the comparison is the same
    # whether get_p returns scalars or 1x1 arrays.
    values = [np.asarray(s, dtype=float).item() for s in scores]
    # Classes are numbered from 1; the first maximum wins on ties.
    data.iloc[row, 5] = values.index(max(values)) + 1
    if data.iloc[row, 5] == data.iloc[row, 4]:
        cnt += 1

accuracy = cnt/len(data_te)
print("精确率：", accuracy)

data.to_excel('result_new.xlsx', index=None)
print(data)

方法三：与方法二一样，不过在此基础上将函数封装起来

运行结果：

图表 5判别结果

图表 6正确率

代码如下：

import time

import math

import numpy as np

import pandas as pd

from sklearn.metrics import accuracy_score



np.set_printoptions(suppress=True)





class My_Gaussian(object):
    """Gaussian naive Bayes classifier using the minimum-error-rate rule.

    Each feature is modelled per class as an independent normal distribution
    whose mean and variance are estimated from the training data. A sample is
    assigned to the class that maximises prior * product of feature densities.

    Attributes set by ``fit``:
        train_data: copy of the training features plus a ``label`` column.
        mean_mat: per-class feature means (rows: classes, columns: features).
        var_mat: per-class feature variances, same layout as ``mean_mat``.
        prior_rate: array of class priors, ordered like the rows of ``mean_mat``.
    """

    def __init__(self):
        pass

    def fit(self, X_train, y_train):
        """Estimate per-class means, variances and priors.

        Args:
            X_train: DataFrame of training features, one row per sample.
            y_train: Class label of every training row (Series, list or array).

        Returns:
            The fitted classifier itself.
        """
        labels = np.asarray(y_train)
        # Work on a copy so the caller's frame does not gain a 'label' column.
        self.train_data = X_train.copy()
        self._y_train = pd.DataFrame(labels, columns=['label'])
        self.train_data['label'] = labels
        grouped = self.train_data.groupby("label")
        self.mean_mat = grouped.mean()
        self.var_mat = grouped.var()
        self.prior_rate = self.__Calculate_prior_probability()
        return self

    def predict(self, X_test):
        """Predict the class label of every row of ``X_test``.

        Args:
            X_test: DataFrame or 2-D array with the same feature columns, in
                the same order, as the training data.

        Returns:
            A numpy array with one predicted label per row. Labels are the
            values seen during ``fit``, not positional class numbers.
        """
        # Rows of mean_mat are ordered by label, so position k in a score
        # vector corresponds to classes[k].
        classes = self.mean_mat.index.to_numpy()
        pred = [
            self.__Condition_formula(self.mean_mat, self.var_mat, row) * self.prior_rate
            for row in np.asarray(X_test)
        ]
        if not pred:
            return classes[:0]
        best = np.argmax(np.asarray(pred), axis=1)
        return classes[best]

    def __Calculate_prior_probability(self):
        """Return the class priors as relative label frequencies.

        The counts are sorted by label so the order matches the rows of
        ``mean_mat`` and ``var_mat``.
        """
        counts = self._y_train['label'].value_counts().sort_index()
        return (counts / counts.sum()).to_numpy()

    def __Condition_formula(self, mu, sigma2, row):
        """Return the class-conditional likelihood of one sample for every class.

        Args:
            mu: per-class feature means.
            sigma2: per-class feature variances.
            row: feature values of a single sample.

        Returns:
            A Series with one likelihood per class: the product over features
            of the normal density p(x_j | class).
        """
        density = 1 / np.sqrt(2 * math.pi * sigma2) * np.exp(-(row - mu) ** 2 / (2 * sigma2))
        return pd.DataFrame(density).prod(axis=1)

from sklearn.datasets import load_iris

from sklearn.model_selection import train_test_split



# NOTE(review): start_time is recorded but never read in the visible code.
start_time = time.time()

# Load the data set.
data_all = pd.read_excel('wine_data.xlsx')

# Columns 1-3 are used as the features and column 4 as the class label.
data = data_all.iloc[:, 1:4]
target = data_all.iloc[:, 4]

# Number of leading rows used for training; the remaining rows are the test set.
# It is passed to train_test_split as an integer row count: a float fraction
# such as 29/len(data) is floored after multiplication and can lose a row to
# rounding, which would misalign the write-back below.
train_size = 29

X_train, X_test, y_train, y_test = train_test_split(
    data, target, train_size=train_size, shuffle=False)

# Train the classifier.
NB = My_Gaussian()
NB.fit(X_train, y_train)

# Predict both splits so training and test accuracy can be reported.
y_train_NB = NB.predict(X_train)
y_test_NB = NB.predict(X_test)

# Write the test predictions into column 5 of the test rows.
data_all.iloc[train_size:, 5] = y_test_NB

print("训练集评分： ",accuracy_score(y_train,y_train_NB ),"\n测试集评分 ",accuracy_score(y_test,y_test_NB))

data_all.to_excel('result_new1.xlsx', index=None)
print(data_all)