一、线性判别分析的定义
二、线性判别分析——二分类模型
lda2classify.py
import numpy as np
class LDABinaryClassifier:
    """
    Binary classifier based on linear discriminant analysis (LDA).

    Training samples are projected onto a single direction ``weight``; a
    sample is assigned to the first (smaller) class label when its projection
    is at or above the threshold ``w0`` and to the second label otherwise.
    Any pair of distinct labels is supported, not only ``0`` and ``1``.
    """

    def __init__(self):
        self.mu = None  # per-class mean vectors, keyed by class label
        self.Sw_i = None  # per-class scatter matrices, keyed by class label
        self.Sw = None  # within-class scatter matrix (sum of the per-class ones)
        self.weight = None  # model coefficients, i.e. the projection direction
        self.w0 = None  # decision threshold on the projected values
        self.class_values = None  # the two class labels, sorted ascending

    def fit(self, x_train, y_train):
        """
        Compute the projection direction and the decision threshold.

        :param x_train: training samples, array-like of shape (n_samples, n_features)
        :param y_train: training labels, array-like of length n_samples with
            exactly two distinct values
        :return: the projection direction ``weight``
        :raises ValueError: if ``y_train`` does not contain exactly two classes
        """
        x_train, y_train = np.asarray(x_train), np.asarray(y_train)
        class_values = np.sort(np.unique(y_train))  # distinct class labels
        n_samples, n_features = x_train.shape
        if len(class_values) != 2:
            raise ValueError("仅限于二分类且线性可分数据集......")
        self.class_values = class_values

        # 1. Per-class means and scatter matrices, accumulated into Sw.
        self.Sw_i = dict()
        self.mu = dict()
        self.Sw = np.zeros((n_features, n_features))
        class_size = []  # number of samples per class, in label order
        for label_val in class_values:
            class_x = x_train[y_train == label_val]
            class_size.append(class_x.shape[0])
            self.mu[label_val] = np.mean(class_x, axis=0)
            centered = class_x - self.mu[label_val]
            self.Sw_i[label_val] = centered.T.dot(centered)
            self.Sw += self.Sw_i[label_val]

        # 2. Projection direction w = Sw^{-1} (mu_first - mu_second).
        # The means are looked up by the actual labels rather than by the
        # literal keys 0 and 1, so arbitrary label pairs work.
        first, second = class_values
        mean_diff = self.mu[first] - self.mu[second]
        try:
            inv_sw = np.linalg.inv(self.Sw)
        except np.linalg.LinAlgError:
            # Sw is exactly singular (e.g. a constant feature): fall back to
            # the Moore-Penrose pseudo-inverse instead of failing.
            inv_sw = np.linalg.pinv(self.Sw)
        self.weight = inv_sw.dot(mean_diff)

        # 3. Threshold: class-size weighted average of the projected means.
        self.w0 = (class_size[0] * self.weight.dot(self.mu[first])
                   + class_size[1] * self.weight.dot(self.mu[second])) / n_samples
        return self.weight

    def predict(self, x_test):
        """
        Predict class labels for the given samples.

        :param x_test: samples, array-like of shape (n_samples, n_features);
            a single 1-D sample is also accepted
        :return: array with one predicted label per sample, taken from the
            labels seen during ``fit``
        :raises ValueError: if the model has not been fitted
        """
        if self.weight is None or self.class_values is None:
            raise ValueError("LDABinaryClassifier must be fitted before calling predict.")
        x_test = np.atleast_2d(np.asarray(x_test))
        scores = self.weight.dot(x_test.T) - self.w0
        first, second = self.class_values
        # Projections below the threshold belong to the second class.
        return np.where(scores < 0, second, first)
test_lda2classify.py
from sklearn.datasets import load_iris, load_breast_cancer
from lda2classify import LDABinaryClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Alternative data set: the first 100 iris samples.
# iris = load_iris()
# features, labels = iris.data[:100, :], iris.target[:100]

# Breast-cancer data set.
dataset = load_breast_cancer()
features = dataset.data
labels = dataset.target

# Stratified 70/30 train/test split with a fixed seed for reproducibility.
split = train_test_split(
    features, labels, test_size=0.3, random_state=111, stratify=labels
)
train_x, test_x, train_y, test_y = split

# Fit the classifier and report its performance on the held-out samples.
classifier = LDABinaryClassifier()
classifier.fit(train_x, train_y)
print(classification_report(test_y, classifier.predict(test_x)))
鸢尾花取前两类:
breast_cancer 数据集:
三、线性判别分析——多分类降维算法
lda_multi_dim_reduction.py
import numpy as np
import scipy as sp
class LDAMulti_DimReduction:
    """
    Multi-class linear discriminant analysis used for dimensionality reduction.

    ``fit`` solves the generalized eigenproblem ``Sb v = lambda Sw v`` and
    keeps the eigenvectors belonging to the ``n_components`` largest
    eigenvalues as the columns of the projection matrix ``W``.
    """

    def __init__(self, n_components=2):
        self.n_components = n_components  # dimensionality after reduction
        self.Sw, self.Sb = None, None  # within-class / between-class scatter
        self.eig_values = None  # generalized eigenvalues, sorted descending
        self.W = None  # projection matrix, one column per kept component

    def fit(self, x_samples, y_target):
        """
        Compute the projection matrix from labelled samples.

        :param x_samples: samples, array-like of shape (n_samples, n_features)
        :param y_target: class labels, array-like of length n_samples
        :return: the projection matrix ``W``
        """
        # Import the submodule explicitly: a bare ``import scipy`` is not
        # guaranteed to make ``scipy.linalg`` available on every SciPy version.
        from scipy import linalg

        x_samples, y_target = np.asarray(x_samples), np.asarray(y_target)
        _, n_features = x_samples.shape

        # Within-class scatter: sum of the scatter matrices of each class.
        self.Sw = np.zeros((n_features, n_features))
        for label in np.unique(y_target):
            class_x = x_samples[y_target == label]
            centered = class_x - np.mean(class_x, axis=0)
            self.Sw += centered.T.dot(centered)

        # Between-class scatter as total scatter minus within-class scatter.
        centered_all = x_samples - np.mean(x_samples, axis=0)
        self.Sb = centered_all.T.dot(centered_all) - self.Sw

        eig_values, eig_vec = linalg.eig(self.Sb, self.Sw)
        # ``eig`` reports eigenvalues as complex numbers; drop the imaginary
        # part when every eigenvalue is in fact real.
        if np.all(np.imag(eig_values) == 0):
            eig_values = np.real(eig_values)

        order = np.argsort(eig_values)[::-1]  # largest eigenvalue first
        self.eig_values = eig_values[order]
        self.W = eig_vec[:, order][:, :self.n_components]
        return self.W

    def transform(self, x_samples):
        """
        Project samples with the fitted projection matrix.

        :param x_samples: samples, array-like of shape (n_samples, n_features)
        :return: projected samples, one column per kept component
        :raises ValueError: if ``fit`` has not been called yet
        """
        if self.W is None:
            raise ValueError("请先进行fit,构造投影矩阵,然后降维...")
        # Convert first so that lists and other array-likes are accepted,
        # exactly as in ``fit``.
        return np.asarray(x_samples).dot(self.W)

    def fit_transform(self, x_samples, y_target):
        """
        Fit the projection matrix and return the projected samples.

        :param x_samples: samples, array-like of shape (n_samples, n_features)
        :param y_target: class labels, array-like of length n_samples
        :return: projected samples, one column per kept component
        """
        self.fit(x_samples, y_target)
        return self.transform(x_samples)

    def variance_explained(self):
        """
        Return each kept eigenvalue's share of the sum of all eigenvalues.

        :return: array with ``n_components`` ratios, largest first
        :raises ValueError: if ``fit`` has not been called yet
        """
        if self.eig_values is None:
            raise ValueError("LDAMulti_DimReduction must be fitted before calling variance_explained.")
        eig_values = self.eig_values
        if np.all(np.imag(eig_values) == 0):
            eig_values = np.real(eig_values)
        ratio = eig_values / np.sum(eig_values)
        return ratio[:self.n_components]
test_lda_dim_reduction.py
from sklearn.datasets import load_iris, load_wine, make_classification
from lda_multi_dim_reduction import LDAMulti_DimReduction
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
# Data set under study: iris. The alternatives below can be swapped in.
iris = load_iris()
X, y = iris.data, iris.target
# X, y = make_classification(n_samples=2000, n_features=20, n_informative=3, n_classes=5,
# n_redundant=0, n_clusters_per_class=1, class_sep=2, random_state=42)
#
# wine = load_wine()
# X, y = wine.data, wine.target

X = StandardScaler().fit_transform(X)

# Number of discriminant directions kept by BOTH implementations.
# The between-class scatter has rank at most n_classes - 1, so 2 is the
# maximum informative value for the 3-class iris and wine data; raise it to 3
# for the 5-class make_classification data.
n_components = 2

# Custom implementation.
lda = LDAMulti_DimReduction(n_components=n_components)
lda.fit(X, y)
x_new = lda.transform(X)
print(lda.variance_explained())

plt.figure(figsize=(14, 5))
plt.subplot(121)
plt.scatter(x_new[:, 0], x_new[:, 1], marker="o", c=y)
plt.xlabel("PC1", fontdict={"fontsize": 12})
plt.ylabel("PC2", fontdict={"fontsize": 12})
plt.title("LDA Dimension Reduction (Myself)", fontdict={"fontsize": 14})
plt.grid(ls=":")
# Optional view of the 2nd/3rd components (requires n_components >= 3):
# plt.subplot(222)
# plt.scatter(x_new[:, 1], x_new[:, 2], marker="o", c=y)
# plt.xlabel("PC2", fontdict={"fontsize": 12})
# plt.ylabel("PC3", fontdict={"fontsize": 12})
# plt.title("LDA Dimension Reduction (Myself)", fontdict={"fontsize": 14})
# plt.grid(ls=":")

# scikit-learn reference implementation, kept under its own name so it does
# not shadow the custom model above.
lda_skl = LinearDiscriminantAnalysis(n_components=n_components)
lda_skl.fit(X, y)
x_skl = lda_skl.transform(X)

plt.subplot(122)
# Plot the scikit-learn projection here (not x_new), otherwise both panels
# show the same data and the comparison is meaningless.
plt.scatter(x_skl[:, 0], x_skl[:, 1], marker="o", c=y)
plt.xlabel("PC1", fontdict={"fontsize": 12})
plt.ylabel("PC2", fontdict={"fontsize": 12})
plt.title("LDA Dimension Reduction (Sklearn)", fontdict={"fontsize": 14})
plt.grid(ls=":")
# Optional view of the 2nd/3rd components (requires n_components >= 3):
# plt.subplot(224)
# plt.scatter(x_skl[:, 1], x_skl[:, 2], marker="o", c=y)
# plt.xlabel("PC2", fontdict={"fontsize": 12})
# plt.ylabel("PC3", fontdict={"fontsize": 12})
# plt.title("LDA Dimension Reduction (Sklearn)", fontdict={"fontsize": 14})
# plt.grid(ls=":")

plt.tight_layout()
plt.show()
鸢尾花数据集:
降维后前两个主特征的解释方差比
[9.91212605e-01 8.78739503e-03]
红酒数据集:
降维后前两个主特征的解释方差比
[6.87478888e-01 3.12521112e-01]
使用make_classification创建数据集:
降维后前三个主特征的解释方差比
[0.47101585 0.44946339 0.07876534]