朴素贝叶斯、线性判别准则、二次判别准则的概率统计原理请看这一篇https://blog.csdn.net/To_be_to_thought/article/details/81223561
围绕上一篇原理部分用python编码如下:
import numpy as np
import math
from matplotlib import pyplot as plt
from sklearn.datasets import load_iris #用于读入鸢尾花数据集(iris 是鸢尾花,不是水仙花)
from sklearn.model_selection import train_test_split #用于数据集分割
# Naive Bayes classifier -- training step.
# data: (n, p) feature matrix (attribute columns only, no label column).
# labels: length-n vector of integer class labels, assumed to be 0..K-1.
# Returns (mu, stds, K):
#   mu[k, i]   -- estimated mean of attribute i within class k          (K, p)
#   stds[k, i] -- estimated standard deviation of attribute i in class k (K, p)
#   K          -- number of classes
def NaiveBayesianTrain(data, labels):
    data = np.asarray(data)
    labels = np.asarray(labels)
    n, p = data.shape
    # Count classes directly: the original called cal_K_Prior() here and then
    # threw away the prior it computed.
    K = len(np.unique(labels))
    mu = np.zeros((K, p))
    stds = np.zeros((K, p))
    for k in range(K):
        subset = data[labels == k]       # all samples of class k
        mu[k] = subset.mean(axis=0)      # per-attribute mean
        stds[k] = subset.std(axis=0)     # per-attribute std (population, ddof=0, as before)
    return mu, stds, K
# Print the accuracy of predict_labels against the ground-truth labels
# (as a percentage). Returns nothing; the report goes to stdout.
def train_accuracy(predict_labels, labels):
    # BUG FIX: np.int was removed in NumPy 1.24; the mean of the boolean
    # comparison gives the same fraction without the deprecated cast.
    in_accuracy = (np.asarray(predict_labels) == np.asarray(labels)).mean()
    print("train accuracy: %0.2f%%" % (in_accuracy * 100))
# Gaussian naive Bayes classifier.
# Input:  data, labels -- training set and its label vector;
#         out_samples  -- samples to classify, shape (m, p).
# Output: predicted label vector for out_samples (training accuracy is
#         printed as a side effect).
def _nb_log_scores(samples, mu, stds, log_prior):
    """Per-sample, per-class log posterior, up to a class-independent constant.

    Sums log N(x_i | mu_k, diag(stds_k^2)) over features and adds log_prior[k];
    the shared -p/2*log(2*pi) term is dropped since it does not affect argmax.
    """
    samples = np.asarray(samples)
    m = samples.shape[0]
    K = mu.shape[0]
    scores = np.zeros((m, K))
    for j in range(K):
        scores[:, j] = (log_prior[j]
                        - np.sum(np.log(stds[j]))
                        - np.sum((samples - mu[j]) ** 2 / (2 * stds[j] ** 2), axis=1))
    return scores

def NaiveBayesianPredictor(data, labels, out_samples):
    mu, stds, K = NaiveBayesianTrain(data, labels)
    _, prior = cal_K_Prior(labels)
    # BUG FIX: the original dropped the log-prior term entirely; including it
    # makes this a proper MAP decision rule (for balanced classes the argmax
    # is unchanged).
    log_prior = np.log(prior[:, 0])
    # Training-set accuracy report (the original duplicated this scoring loop
    # verbatim for train and test data).
    train_scores = _nb_log_scores(data, mu, stds, log_prior)
    train_accuracy(np.argmax(train_scores, axis=1), labels)
    out_samples = np.asarray(out_samples)   # (m, p)
    test_scores = _nb_log_scores(out_samples, mu, stds, log_prior)
    return np.argmax(test_scores, axis=1)
# Compute, from the training labels, the number of classes K and the prior
# class distribution.
# labels: iterable of integer class labels, assumed to be 0..K-1.
# Returns (K, prior) where prior is a (K, 1) column vector with
# prior[k, 0] = (# samples of class k) / n, so prior sums to 1.
def cal_K_Prior(labels):
    counts = {}                     # class label -> number of samples
    for label in labels:
        counts[label] = counts.get(label, 0) + 1
    K = len(counts)
    n = len(labels)
    # BUG FIX: the original divided by n twice (once per class inside the loop
    # and once again on the assembled array), so its "prior" summed to 1/n.
    prior = np.array([counts[k] / n for k in range(K)]).reshape((K, 1))
    return K, prior
# Linear discriminant analysis -- training step.
# data: (n, p) feature matrix; labels: length-n vector of classes 0..K-1.
# Returns the coefficients of the per-class linear discriminant
# delta_k(x) = x @ coefficients[:, k] + constant[0, k]:
#   coefficients -- (p, K) linear-term matrix
#   constant     -- (1, K) intercepts
def LinearDiscriminateAnalysis(data, labels):
    n, p = data.shape               # BUG FIX: the original unpacked these swapped (p, n)
    K, prior = cal_K_Prior(labels)
    # Per-class mean vectors, stacked as the columns of a (p, K) matrix.
    mu = []
    for k in range(K):
        mu.append(np.mean(data[labels == k, ], axis=0))
    mu = np.array(mu).T
    # NOTE(review): this is the covariance of the whole data set; textbook LDA
    # uses the pooled within-class covariance -- confirm which is intended.
    covariance = np.cov(data.T)     # (p, p)
    cov_inv = np.linalg.inv(covariance)   # hoisted: was re-inverted once per class
    constant = []
    for k in range(K):
        mu_k = mu[:, k]             # k-th class mean, length p
        constant.append(np.log(prior[k, 0])
                        - 1 / 2 * np.dot(np.dot(mu_k.T, cov_inv), mu_k))
    constant = np.array(constant).reshape((1, K))   # (1, K)
    coefficients = np.dot(cov_inv, mu)              # (p, K)
    return coefficients, constant
# LDA classifier: fit on (data, labels), print the training accuracy, and
# return the predicted label vector for out_samples.
def LDA_Predictor(data, labels, out_samples):
    coefficients, constant = LinearDiscriminateAnalysis(data, labels)
    # Training-set predictions, used only for the accuracy report.
    train_scores = np.dot(data, coefficients) + constant
    train_accuracy(np.argmax(train_scores, axis=1), labels)
    # Out-of-sample predictions (the original converted out_samples to an
    # array twice).
    out_samples = np.array(out_samples)     # (m, p)
    test_scores = np.dot(out_samples, coefficients) + constant
    return np.argmax(test_scores, axis=1)
# Quadratic discriminant analysis -- training step.
# datas: (n, p) feature matrix; labels: length-n vector of classes 0..K-1.
# Returns the coefficients of the per-class quadratic discriminant
# delta_k(x) = x @ quadratic_coef[k] @ x + x @ linear_coef[:, k] + constant[0, k]:
#   quadratic_coef -- (K, p, p) quadratic-term tensors
#   linear_coef    -- (p, K)    linear-term matrix
#   constant       -- (1, K)    intercepts
def QuadraticDiscriminateAnalysis(datas, labels):
    n, p = datas.shape
    K, prior = cal_K_Prior(labels)
    # Per-class mean vector and covariance matrix.
    mu = []
    covariances = []
    for k in range(K):
        mu.append(np.mean(datas[labels == k, ], axis=0))
        covariances.append(np.cov(datas[labels == k, ].T))
    mu = np.array(mu).T                     # (p, K)
    covariances = np.array(covariances)     # (K, p, p)
    # Hoisted: the original inverted each class covariance twice inside the
    # loop and a third time (batched) for the quadratic term.
    inv_covs = np.linalg.inv(covariances)   # batched inverse, (K, p, p)
    # slogdet is numerically safer than log(det(...)) for near-singular
    # matrices and identical for positive-definite covariances.
    _, logdets = np.linalg.slogdet(covariances)
    constant = []
    linear_coef = []
    for k in range(K):
        mu_k = mu[:, k]
        constant.append(np.log(prior[k, 0]) - 1 / 2 * logdets[k]
                        - 1 / 2 * np.dot(np.dot(mu_k.T, inv_covs[k]), mu_k))
        linear_coef.append(np.dot(inv_covs[k], mu_k))
    constant = np.array(constant).reshape((1, K))   # (1, K)
    linear_coef = np.array(linear_coef).T           # (p, K)
    quadratic_coef = -1 / 2 * inv_covs              # (K, p, p)
    return quadratic_coef, linear_coef, constant
# NOTE(review): the triple-quoted string below is dead code the author kept on
# purpose -- an abandoned attempt to vectorise QuadraticDiscriminateAnalysis
# with matrix x 3-D-tensor x matrix products; the author notes the broadcasting
# rules did not match the intended per-class computation, so it was dropped.
'''
constant=np.log(prior)-1/2*np.dot(np.dot(mu.T,np.linalg.inv(covariances)),mu)-1/2*np.log(np.linalg.det(covariances)) #
linear_coef=np.dot(np.linalg.inv(covariances),mu)
# (K*p*p)*(p*K)=(K*p*K) np.dot(np.linalg.inv(covariances),mu[:,i])的结果为linear_coef的每一页的第i列向量组成的矩阵的转置
quadratic_coef=-1/2*np.linalg.inv(covariances)
return quadratic_coef,linear_coef,constant
这一段想尝试矩阵*三维张量*矩阵直接把循环转成简洁的向量式编程的,但实际运算法则跟向量*三维张量*向量不太一致,遂放弃
'''
# QDA classifier: fit on (data, labels), print the training accuracy, and
# return the predicted label vector for out_samples.
def QDA_Predictor(data, labels, out_samples):
    quadratic_coef, linear_coef, constant = QuadraticDiscriminateAnalysis(data, labels)
    K = len(set(labels))
    # Accuracy on the training set.
    scores = QuadraticTerm(quadratic_coef, data, K) + np.dot(data, linear_coef) + constant
    train_accuracy(np.argmax(scores, axis=1), labels)
    # Class scores for the held-out samples, shape (m, K).
    out_samples = np.array(out_samples)
    scores = (QuadraticTerm(quadratic_coef, out_samples, K)
              + np.dot(out_samples, linear_coef) + constant)
    return np.argmax(scores, axis=1)
# Evaluate the quadratic form x_i @ quadratic_coef[k] @ x_i for every sample i
# and every class k.
# quadratic_coef: (K, p, p) per-class quadratic-term tensors.
# samples:        (n, p) sample matrix (any array-like).
# K:              number of classes -- kept for backward compatibility; it is
#                 implied by quadratic_coef.shape[0].
# Returns an (n, K) matrix.
def QuadraticTerm(quadratic_coef, samples, K):
    samples = np.asarray(samples, dtype=float)
    # One vectorised einsum replaces the per-sample Python loop:
    # ret[i, k] = sum_{p, q} samples[i, p] * quadratic_coef[k, p, q] * samples[i, q]
    return np.einsum('ip,kpq,iq->ik', samples, np.asarray(quadratic_coef), samples)
if __name__ == "__main__":
    # Load the iris data set.
    iris = load_iris()
    data = iris.data
    labels = iris.target
    # Shuffle, then split 2:1 into train/test, and cross-check the three
    # Bayesian classifiers on the same split.
    train_data, test_data, train_labels, test_labels = train_test_split(
        data, labels, test_size=0.33, random_state=1)
    # Same report for all three predictors (the original repeated this stanza
    # three times).
    for predictor in (LDA_Predictor, QDA_Predictor, NaiveBayesianPredictor):
        predicted = predictor(train_data, train_labels, test_data)
        # BUG FIX: np.int was removed in NumPy 1.24; the mean of the boolean
        # comparison is equivalent.
        accuracy = (predicted == test_labels).mean()
        print("test accuracy: %0.2f%%" % (accuracy * 100))
测试结果如下:三种分类器在测试集上的精度排序为 QDA > NB > LDA。