参考文章:https://blog.csdn.net/qq_30091945/article/details/81508055
作为机器学习的小白,我最近简单实现了GDA(高斯判别分析),其中还有很多不足的地方,欢迎大家指出。
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Author:
# Time:2021-11-21
# 参考文章:https://blog.csdn.net/qq_30091945/article/details/81508055
import numpy as np
# import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
# 高斯判别分析的类
class GDA:
    """Gaussian discriminant analysis for binary labels with a shared covariance.

    The model assumes ``y ~ Bernoulli(phi)`` and ``x | y = k ~ N(mu_k, Sigma)``
    with one covariance matrix ``Sigma`` shared by both classes.

    Attributes set by :meth:`cal_param` (all plain NumPy arrays / floats):
        bonuli: estimate of ``phi = P(y = 1)``.
        miu0: mean vector of the samples whose label is falsy (class 0).
        miu1: mean vector of the samples whose label is truthy (class 1).
        cov: pooled maximum-likelihood covariance matrix, shape ``(d, d)``.
    """

    # Value added to the covariance diagonal so that it stays invertible even
    # when some feature has zero variance in the training data.
    _RIDGE = 0.001

    def __init__(self, train_data, train_labels):
        """Store the training set; parameters are estimated in ``cal_param``.

        Args:
            train_data: 2-D array-like, one sample per row.
            train_labels: sequence of labels, one per row; any truthy value
                is treated as class 1 and any falsy value as class 0.
        """
        self.train_data = train_data
        self.train_labels = train_labels
        self.bonuli = 0  # Bernoulli parameter phi = P(y = 1)
        self.miu0 = 0  # mean of the class-0 Gaussian
        self.miu1 = 0  # mean of the class-1 Gaussian
        self.cov = 0  # shared covariance matrix

    def cal_param(self):
        """Estimate ``bonuli``, ``miu0``, ``miu1`` and ``cov`` by maximum likelihood.

        Raises:
            ValueError: if the number of labels does not match the number of
                samples, or if one of the two classes has no samples (its
                mean would be undefined).
        """
        features = np.asarray(self.train_data, dtype=float)
        is_positive = np.asarray(self.train_labels).astype(bool).ravel()
        if features.shape[0] != is_positive.shape[0]:
            raise ValueError("train_data and train_labels differ in length")

        positive_num = int(is_positive.sum())
        negative_num = int(is_positive.size) - positive_num
        if positive_num == 0 or negative_num == 0:
            raise ValueError("training data must contain samples of both classes")

        miu1 = features[is_positive].mean(axis=0)
        miu0 = features[~is_positive].mean(axis=0)

        # Centre every sample on the mean of ITS OWN class, then pool the
        # scatter of both classes and divide by the total sample count.
        own_mean = np.where(is_positive[:, None], miu1, miu0)
        centered = features - own_mean

        self.bonuli = positive_num / (positive_num + negative_num)
        self.miu0 = miu0
        self.miu1 = miu1
        self.cov = centered.T @ centered / features.shape[0]

    def _log_gauss(self, x, miu, cov):
        """Return the log of the regularised Gaussian density at ``x``."""
        cov = np.asarray(cov, dtype=float)
        dim = cov.shape[0]
        regularized = cov + np.eye(dim) * self._RIDGE
        diff = (np.asarray(x, dtype=float).ravel()
                - np.asarray(miu, dtype=float).ravel())
        # slogdet yields log|det| directly, avoiding overflow/underflow of det.
        _, log_abs_det = np.linalg.slogdet(regularized)
        mahalanobis = diff @ np.linalg.solve(regularized, diff)
        return -0.5 * (dim * np.log(2 * np.pi) + log_abs_det + mahalanobis)

    def gauss_model(self, x, miu, cov):
        """Evaluate the multivariate Gaussian density ``N(x; miu, cov)``.

        ``0.001`` is added to the diagonal of ``cov`` before it is inverted so
        that a singular covariance does not break the computation.

        Args:
            x: one sample (any array-like that flattens to ``d`` values).
            miu: mean vector (flattens to ``d`` values).
            cov: covariance matrix of shape ``(d, d)``.

        Returns:
            The density as a Python float. It may underflow to ``0.0`` for
            samples far from ``miu``.
        """
        return float(np.exp(self._log_gauss(x, miu, cov)))

    def prediction(self, test_data, test_labels=None):
        """Fit the parameters on the training set and classify ``test_data``.

        Args:
            test_data: 2-D array-like, one sample per row.
            test_labels: unused; accepted only so that existing callers that
                pass the labels keep working.

        Returns:
            A list with one predicted label (``0`` or ``1``) per row of
            ``test_data``. Ties are resolved in favour of class 1.
        """
        self.cal_param()
        log_prior_pos = np.log(self.bonuli)
        log_prior_neg = np.log(1 - self.bonuli)
        predict_labels = []
        for sample in np.asarray(test_data, dtype=float):
            # Compare joint log-probabilities: in linear space both sides can
            # underflow to 0 and the comparison would degenerate to a tie.
            positive = self._log_gauss(sample, self.miu1, self.cov) + log_prior_pos
            negative = self._log_gauss(sample, self.miu0, self.cov) + log_prior_neg
            predict_labels.append(1 if positive >= negative else 0)
        return predict_labels
# 损失函数部分
# 没有损失函数,因为分类结果只有0,1不好有损失函数
# 画图部分
# GDA 是由概率求出的,所以我不会画图来显示分界线
# 数据加载部分
def load_data(path=r'E:/data/textset.txt'):
    """Read a whitespace-separated data set of two features and one label.

    Each non-blank line must hold at least three numeric columns: the two
    feature values followed by the class label. Blank lines are skipped.

    Args:
        path: location of the text file; defaults to the path the script was
            originally written against.

    Returns:
        A tuple ``(datax, labels)`` where ``datax`` is a NumPy array with one
        ``[feature0, feature1]`` row per sample and ``labels`` is a plain
        list of ints.
    """
    datax = []
    labels = []
    # The context manager closes the file even if a malformed line raises.
    with open(path, encoding='UTF-8') as fp:
        for line in fp:
            fields = line.split()
            if not fields:  # blank or whitespace-only line
                continue
            datax.append([float(fields[0]), float(fields[1])])
            # Go through float first so that labels written as "1.0" parse too.
            labels.append(int(float(fields[2])))
    return np.array(datax), labels
# 主函数
def main():
    """Compare GDA and logistic regression on a random 90/10 split.

    Loads the data set, holds out 10% of it for testing, and prints the test
    accuracy of the hand-written GDA classifier followed by that of
    scikit-learn's ``LogisticRegression``.
    """
    features, targets = load_data()
    # The split yields the features as arrays and the labels as lists.
    split = train_test_split(features, targets, test_size=0.1, random_state=None)
    x_train, x_test, y_train, y_test = split

    # Gaussian discriminant analysis.
    gda_model = GDA(x_train, y_train)
    gda_accuracy = accuracy_score(y_test, gda_model.prediction(x_test, y_test))
    print("高斯判别分析的准确率为:", gda_accuracy)

    # Logistic regression baseline.
    logistic = LogisticRegression()
    logistic.fit(x_train, y_train)
    lr_accuracy = accuracy_score(y_test, logistic.predict(x_test))
    print("逻辑回归的准确率为:", lr_accuracy)
# Run the comparison only when this file is executed as a script.
if __name__ == "__main__":
    main()
结果显示:
小结:
由于所需的公式都已经由数学推导给出,而且这份代码的通用性很弱,所以整体实现并不难。
对比GDA和LR:GDA要求p(x|y)服从高斯分布、p(y)服从伯努利分布,对数据的假设更强;当这一假设成立时,在数据较少、维度较低的情况下效果更好。
LR只需要较弱的假设,不要求数据服从高斯分布,在数据量较大的情况下效果会更好一些。
数据集:
-0.017612 14.053064 0
-1.395634 4.662541 1
-0.752157 6.538620 0
-1.322371 7.152853 0
0.423363 11.054677 0
0.406704 7.067335 1
0.667394 12.741452 0
-2.460150 6.866805 1
0.569411 9.548755 0
-0.026632 10.427743 0
0.850433 6.920334 1
1.347183 13.175500 0
1.176813 3.167020 1
-1.781871 9.097953 0
-0.566606 5.749003 1
0.931635 1.589505 1
-0.024205 6.151823 1
-0.036453 2.690988 1
-0.196949 0.444165 1
1.014459 5.754399 1
1.985298 3.230619 1
-1.693453 -0.557540 1
-0.576525 11.778922 0
-0.346811 -1.678730 1
-2.124484 2.672471 1
1.217916 9.597015 0
-0.733928 9.098687 0
-3.642001 -1.618087 1
0.315985 3.523953 1
1.416614 9.619232 0
-0.386323 3.989286 1
0.556921 8.294984 1
1.224863 11.587360 0
-1.347803 -2.406051 1
1.196604 4.951851 1
0.275221 9.543647 0
0.470575 9.332488 0
-1.889567 9.542662 0
-1.527893 12.150579 0
-1.185247 11.309318 0
-0.445678 3.297303 1
1.042222 6.105155 1
-0.618787 10.320986 0
1.152083 0.548467 1
0.828534 2.676045 1
-1.237728 10.549033 0
-0.683565 -2.166125 1
0.229456 5.921938 1
-0.959885 11.555336 0
0.492911 10.993324 0
0.184992 8.721488 0
-0.355715 10.325976 0
-0.397822 8.058397 0
0.824839 13.730343 0
1.507278 5.027866 1
0.099671 6.835839 1
-0.344008 10.717485 0
1.785928 7.718645 1
-0.918801 11.560217 0
-0.364009 4.747300 1
-0.841722 4.119083 1
0.490426 1.960539 1
-0.007194 9.075792 0
0.356107 12.447863 0
0.342578 12.281162 0
-0.810823 -1.466018 1
2.530777 6.476801 1
1.296683 11.607559 0
0.475487 12.040035 0
-0.783277 11.009725 0
0.074798 11.023650 0
-1.337472 0.468339 1
-0.102781 13.763651 0
-0.147324 2.874846 1
0.518389 9.887035 0
1.015399 7.571882 0
-1.658086 -0.027255 1
1.319944 2.171228 1
2.056216 5.019981 1
-0.851633 4.375691 1
-1.510047 6.061992 0
-1.076637 -3.181888 1
1.821096 10.283990 0
3.010150 8.401766 1
-1.099458 1.688274 1
-0.834872 -1.733869 1
-0.846637 3.849075 1
1.400102 12.628781 0
1.752842 5.468166 1
0.078557 0.059736 1
0.089392 -0.715300 1
1.825662 12.693808 0
0.197445 9.744638 0
0.126117 0.922311 1
-0.679797 1.220530 1
0.677983 2.556666 1
0.761349 10.693862 0
-2.168791 0.143632 1
1.388610 9.341997 0
0.317029 14.739025 0