线性判别—Fisher判别(LDA)
'''
#
#第一步,求取各类样本的均值向量
#第二步,求类内离散度矩阵(协方差矩阵)
#第三步,求总类内离散度矩阵S_w(协方差矩阵相加)
#第四步,求w,即S_w的逆矩阵与两类均值向量之差(m0 - m1)的矩阵乘法
#第五步,求阈值y0,根据实际决策分类
#第六步,对测试集进行预测
#
'''
import numpy as np
from sklearn.datasets import load_breast_cancer
breast_cancer = load_breast_cancer()
#
# Breast-cancer dataset shipped with scikit-learn:
#   - 569 samples
#   - 30 features (attributes) per sample, stored under 'data'
#   - 'target' holds the label of each sample (patient or not), which makes
#     this a typical binary-classification dataset
# See the official scikit-learn documentation for the details.
#
from sklearn.model_selection import train_test_split
x = breast_cancer['data'] # feature matrix
y = breast_cancer['target'] # labels
# Random split: 20% of the data is held out as the test set.
# No random_state is passed, so the split (and the reported accuracy) changes from run to run.
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=0.2)
def get_mean_vector(target, x=None, y=None):
    """Return the mean feature vector of the samples labelled ``target``.

    The earlier version accumulated the per-feature sum of the class and then
    took ``np.mean`` over a single-row matrix, which returns that row
    unchanged, so it returned the class *sum* instead of the class *mean*.

    :param target: class label to select (0 or 1 in the binary task).
    :param x: optional sample matrix, one row per sample; defaults to the
        module-level ``x_train``.
    :param y: optional label vector aligned with ``x``; defaults to the
        module-level ``y_train``.
    :return: 1-D array holding one mean value per feature.
    :raises ValueError: if no sample carries the label ``target``.
    """
    samples = np.asarray(x_train if x is None else x, dtype=float)
    labels = np.asarray(y_train if y is None else y)

    # Boolean-mask selection keeps only the rows of the requested class.
    members = samples[labels == target]
    if members.shape[0] == 0:
        # The mean of an empty class is undefined; fail loudly rather than
        # hand back zeros or NaN that would silently corrupt w.
        raise ValueError("no samples with label {!r}".format(target))
    return members.mean(axis=0)
计算类内离散度矩阵:
def get_dispersion_matrix(target, mean_vector, x=None, y=None):
    """Return the within-class scatter matrix of the class ``target``.

    The result is ``sum_i (x_i - m)(x_i - m)^T`` over the samples of the
    class.  The earlier version used ``np.multiply`` on two 1-D vectors,
    which is an element-wise product (transposing a 1-D array is a no-op),
    so every row of the result received the vector of squared differences
    instead of the outer product.

    :param target: class label to select.
    :param mean_vector: mean feature vector ``m`` of that class.
    :param x: optional sample matrix, one row per sample; defaults to the
        module-level ``x_train``.
    :param y: optional label vector aligned with ``x``; defaults to the
        module-level ``y_train``.
    :return: square array of shape (n_features, n_features); all zeros when
        the class has no samples.
    """
    samples = np.asarray(x_train if x is None else x, dtype=float)
    labels = np.asarray(y_train if y is None else y)

    centered = samples[labels == target] - np.asarray(mean_vector, dtype=float)
    # centered^T . centered equals the sum of the per-sample outer products.
    return np.dot(centered.T, centered)
拉格朗日乘子法求解w:
def get_w(dispersion_matrix, mean_vector1, mean_vector2):
    """Solve the Fisher criterion for the projection direction ``w``.

    Computes ``w = S_w^+ (m0 - m1)`` where ``S_w^+`` is the Moore-Penrose
    pseudo-inverse of the total within-class scatter matrix.  The
    pseudo-inverse is itself SVD based, but it discards singular values that
    are numerically zero instead of inverting them, so a rank-deficient
    ``S_w`` no longer raises or produces huge, noise-dominated entries.  For
    a well-conditioned ``S_w`` it equals the ordinary inverse.

    :param dispersion_matrix: total within-class scatter matrix ``S_w``.
    :param mean_vector1: mean vector ``m0`` of the first class.
    :param mean_vector2: mean vector ``m1`` of the second class.
    :return: 1-D array ``w`` with one weight per feature.
    """
    mean_difference = np.asarray(mean_vector1) - np.asarray(mean_vector2)
    return np.dot(np.linalg.pinv(dispersion_matrix), mean_difference)
分割阈值y0的选择:
def get_segmentation_threshold(w, x=None, y=None):
    """Return the decision threshold ``y0`` on the projection axis.

    Every sample is projected onto ``w``.  The threshold is the mean of the
    two class projection means weighted by class size:
    ``(n0 * mean0 + n1 * mean1) / (n0 + n1)``.  Samples whose label is not 0
    are counted as the second class.

    :param w: projection direction, one weight per feature.
    :param x: optional sample matrix, one row per sample; defaults to the
        module-level ``x_train``.
    :param y: optional label vector aligned with ``x``; defaults to the
        module-level ``y_train``.
    :return: scalar threshold ``y0``.
    :raises ValueError: if either class has no samples.
    """
    samples = np.asarray(x_train if x is None else x, dtype=float)
    labels = np.asarray(y_train if y is None else y)

    # One matrix-vector product projects all samples at once.
    projections = np.dot(samples, np.asarray(w, dtype=float).ravel())

    in_class0 = labels == 0
    class0_proj = projections[in_class0]
    class1_proj = projections[~in_class0]
    n0, n1 = class0_proj.size, class1_proj.size
    if n0 == 0 or n1 == 0:
        # A class mean is undefined without samples; report that explicitly.
        raise ValueError("both classes need at least one sample")

    return (n0 * class0_proj.mean() + n1 * class1_proj.mean()) / (n0 + n1)
给定样本x计算出在w上的投影点:
根据实际问题,进行决策分类:
def test_single_smaple_check(w, y0, test_sample, test_target):
    """Classify one sample and report whether the prediction is correct.

    The sample is projected onto ``w``; a projection strictly greater than
    ``y0`` is predicted as class 0, anything else as class 1.

    :param w: projection direction, one weight per feature.
    :param y0: decision threshold on the projection axis.
    :param test_sample: feature vector of the sample to classify.
    :param test_target: true label of the sample (0 or 1).
    :return: ``True`` if the predicted label equals ``test_target``,
        otherwise ``False``.
    """
    projection = np.matmul(np.asarray(w).transpose(), np.asarray(test_sample))
    predicted_label = 0 if projection > y0 else 1
    return bool(test_target == predicted_label)


# The name starts with "test_", so pytest would try to collect this helper as
# a test (and fail on its arguments) whenever it is imported into a test
# module; opt out explicitly.
test_single_smaple_check.__test__ = False
def test_check(w, y0):
    """Score the classifier on every sample of the module-level test split.

    :param w: projection direction, one weight per feature.
    :param y0: decision threshold on the projection axis.
    :return: tuple of (number of test samples, number predicted correctly,
        accuracy as correct / total).
    """
    total = x_test.shape[0]
    # Count the samples whose single-sample check succeeds.
    hits = sum(
        1
        for idx in range(total)
        if test_single_smaple_check(w, y0, x_test[idx], y_test[idx])
    )
    return total, hits, hits / total
# Fisher discriminant: fit on the training split, then evaluate on the test split.
# Per-class vectors returned by get_mean_vector for labels 0 and 1.
u0 = get_mean_vector(0)
u1 = get_mean_vector(1)
# Per-class dispersion (scatter) matrices around those vectors.
s0 = get_dispersion_matrix(0, u0)
s1 = get_dispersion_matrix(1, u1)
# Total within-class dispersion matrix is the sum of the two class matrices.
S_w = s0 + s1
# Projection direction w and the decision threshold y0 on that axis.
w = get_w(S_w, u0, u1)
y0 = get_segmentation_threshold(w)
# Evaluate on the held-out samples and print count, hits and accuracy.
test_sum, right_sum, accuracy = test_check(w, y0)
print("Total specimen number:{}\nNumber of correctly predicted samples:{}\nAccuracy:{}\n".format(test_sum, right_sum, accuracy))
Out:(对于数据集的不同划分会得到不同的结果)
Total specimen number:114
Number of correctly predicted samples:95
Accuracy:0.8333333333333334
参考文章:
布衣小张. 『矩阵论笔记』线性判别分析(LDA)最全解读+python实战二分类代码+补充:矩阵求导可以参考[EB/OL]. https://blog.csdn.net/abc13526222160/article/details/90611743
孑渡. 基于Python的Fisher二分类判别模型实现[EB/OL]. https://blog.csdn.net/qq_44459787/article/details/109755888