LDA降维即线性判别分析(Linear Discriminant Analysis),一般情况下都是把数据降维至 1 到 C-1 维,其中 C 为总类别数;也就是说,LDA 降维的目标维度最多为 C-1,不能任意指定。
计算过程:
计算类内散度,散度矩阵是m∗m的对称矩阵,m是特征个数
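首先计算每一类的均值向量 $\mu_i$;然后对该类中的每条数据 $x$ 减去 $\mu_i$,并计算外积 $(x-\mu_i)(x-\mu_i)^T$;最后把所有类别、所有样本的结果相加,就是类内散度矩阵:$S_W=\sum_{i=1}^{C}\sum_{x\in D_i}(x-\mu_i)(x-\mu_i)^T$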
def get_inclass_sw(self, cluster):
    """Compute the within-class scatter matrix S_W.

    For every class label ``i`` in ``1..cluster`` the samples carrying that
    label are centred on their class mean, and the outer products
    ``(x - mean)(x - mean)^T`` of the centred samples are accumulated over
    all classes.

    Args:
        cluster: Number of classes; the labels used are the integers
            ``1..cluster``.

    Returns:
        An ``(m, m)`` array, where ``m`` is the number of columns of
        ``self.dataset``.
    """
    m = self.dataset.shape[1]
    s_w = np.zeros((m, m))
    class_means = self.get_class_mean(cluster)
    for class_id, class_mean in zip(range(1, cluster + 1), class_means):
        # Use the real feature count instead of a hard-coded 4 so the method
        # works for data sets of any width.
        centred = self.dataset[self.label == class_id, :] - np.reshape(
            class_mean, (1, m)
        )
        # centred.T @ centred equals the sum over the class of the per-sample
        # outer products, without a Python-level loop over samples.
        s_w += centred.T.dot(centred)
    return s_w
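计算类间散度矩阵:做法与类内散度类似,只是把"样本减去类均值"换成"类均值 $\mu_i$ 减去总体均值 $\mu$",并按该类的样本数 $n_i$ 加权,即 $S_B=\sum_{i=1}^{C} n_i(\mu_i-\mu)(\mu_i-\mu)^T$。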
def get_between_class_sb(self, cluster):
    """Compute the between-class scatter matrix S_B.

    Each class contributes ``n_i * (mean_i - mean)(mean_i - mean)^T``, where
    ``n_i`` is the number of samples labelled ``i``, ``mean_i`` is the class
    mean and ``mean`` is the mean of the whole data set.

    Args:
        cluster: Number of classes; the labels used are the integers
            ``1..cluster``.

    Returns:
        An ``(m, m)`` array, where ``m`` is the number of columns of
        ``self.dataset``.
    """
    m = self.dataset.shape[1]
    # Reshape once, using the real feature count rather than a hard-coded 4.
    overall_mean = np.mean(self.dataset, axis=0).reshape(m, 1)
    class_means = self.get_class_mean(cluster)
    s_b = np.zeros((m, m))
    for class_id, class_mean in zip(range(1, cluster + 1), class_means):
        n_samples = self.dataset[self.label == class_id, :].shape[0]
        if n_samples == 0:
            # An empty class has a NaN mean; 0 * NaN would poison the whole
            # matrix, while mathematically the class contributes nothing.
            continue
        offset = np.reshape(class_mean, (m, 1)) - overall_mean
        s_b += n_samples * offset.dot(offset.T)
    return s_b
进行LDA运算
SW 是类内散度矩阵,SB 是类间散度矩阵
计算(SW)^-1 *SB的特征值,特征向量
SB的秩为C−1
数据拥有d个特征
因此特征值为0的有 d−C+1 个
所以只考虑前C−1个不为0的特征值的特征向量
def lda(self, cluster, n_components=None):
    """Return the LDA projection matrix W.

    The eigenvectors of ``inv(S_W) . S_B`` are ranked by the absolute value
    of their eigenvalues (largest first) and the leading ones are stacked as
    the columns of ``W``.

    Args:
        cluster: Number of classes; forwarded to the scatter-matrix methods.
        n_components: Number of projection directions to keep. Defaults to
            ``cluster - 1`` (capped at the number of features), because S_B
            has rank at most ``cluster - 1`` and the remaining eigenvalues
            are zero.

    Returns:
        An ``(m, n_components)`` array whose columns are the selected
        eigenvectors, ``m`` being the size of the scatter matrices.

    Raises:
        ValueError: If the number of components is not in ``1..m``.
    """
    s_w = self.get_inclass_sw(cluster)
    s_b = self.get_between_class_sb(cluster)
    # Take the feature count from the matrices instead of hard-coding 4.
    m = s_w.shape[0]
    if n_components is None:
        n_components = min(cluster - 1, m)
    if not 1 <= n_components <= m:
        raise ValueError(
            f"n_components must be between 1 and {m}, got {n_components}"
        )
    eig_vals, eig_vecs = np.linalg.eig(np.linalg.inv(s_w).dot(s_b))
    # A stable sort on the negated magnitudes gives descending order while
    # keeping tied eigenvalues in the order returned by eig.
    order = np.argsort(-np.abs(eig_vals), kind="stable")
    return eig_vecs[:, order[:n_components]]
测试和其余代码(数据集等):
import numpy as np
def read_data():
    """Load the iris data set through scikit-learn.

    Returns:
        A ``(data_x, label)`` tuple: the feature matrix exposed by
        ``load_iris()`` and its target vector shifted by one, so the labels
        start at 1 instead of 0.
    """
    from sklearn.datasets import load_iris

    iris = load_iris()
    # The rest of the code iterates over labels 1..cluster, hence the shift.
    return iris.data, iris.target + 1
class LDA:
    """Holds a labelled data set and computes per-class statistics on it."""

    def __init__(self, dataset, label):
        """Store the sample matrix and its label vector as given.

        Args:
            dataset: Sample matrix; it is indexed with a boolean mask built
                from ``label``.
            label: Label vector compared element-wise against class ids.
        """
        self.label = label
        self.dataset = dataset

    def get_class_mean(self, cluster):
        """Return the mean vector of every class.

        Args:
            cluster: Number of classes; the labels used are the integers
                ``1..cluster``.

        Returns:
            A list with one entry per class, in label order, each being the
            column-wise mean of the samples carrying that label.
        """
        return [
            np.mean(self.dataset[self.label == class_id], axis=0)
            for class_id in range(1, cluster + 1)
        ]
# Demo: build an LDA object from the data returned by read_data() and run
# every step with 3 classes.
test_data, label = read_data()
l = LDA(test_data, label)
# Within-class scatter matrix.
print(l.get_inclass_sw(3))
# Between-class scatter matrix.
print(l.get_between_class_sb(3))
# Project the samples onto the directions returned by lda().
print(l.dataset.dot(l.lda(3)))