LDA过程
- 计算每个类别的均值 $\mu_i$ 和全局样本均值 $\mu$
- 计算类内散度矩阵 $S_w$、全局散度矩阵 $S_t$、类间散度矩阵 $S_b$
- 对矩阵 $S_w^{-1} S_b$ 做特征值分解
- 取最大的d’个特征值所对应的特征向量
- 计算投影矩阵
LDA实现
数据的导入和预处理
# Load the UCI Wine dataset: first column is the class label (1-3),
# the remaining 13 columns are chemical measurements.
df_wine = pd.read_csv('wine.data', header=None)
# NOTE: 'Malic acid' fixes the original typo 'Mail acid' (UCI feature name).
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue',
                   'OD280/OD315 of diluted wines', 'Proline']
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
# Stratified split keeps the class proportions identical in train and test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    stratify=y,
                                                    random_state=0)
sc = StandardScaler()
# Standardize features: fit mean/variance on the training set only, then
# apply the same transform to the test set to avoid data leakage.
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)
np.set_printoptions(precision=4)  # print floats with 4 decimal digits
- 计算平均向量
#计算每一类数据的平均向量
# Mean vector of the standardized training samples for each of the
# three wine classes (labels 1, 2, 3).
mean_vecs = [X_train_std[y_train == lbl].mean(axis=0) for lbl in range(1, 4)]
for lbl, mv in zip(range(1, 4), mean_vecs):
    print('MV %s: %s\n' % (lbl, mv))
- 求类内散度矩阵 $S_W$ 和类间散度矩阵 $S_B$
# Within-class scatter matrix S_W = sum over classes of
# sum_x (x - mu_c)(x - mu_c)^T, computed on the standardized features.
d = 13  # feature dimensionality
S_W = np.zeros((d, d))
for label, mv in zip(range(1, 4), mean_vecs):
    mu = mv.reshape(d, 1)  # class mean as a column vector (hoisted out of the sample loop)
    class_scatter = np.zeros((d, d))
    for sample in X_train_std[y_train == label]:
        diff = sample.reshape(d, 1) - mu
        class_scatter += diff @ diff.T
    # Accumulate each class's scatter into the within-class matrix.
    S_W += class_scatter
# Between-class scatter matrix S_B = sum over classes of
# n_c * (mu_c - mu)(mu_c - mu)^T, where mu is the overall mean.
mean_overall = np.mean(X_train_std, axis=0)  # overall training mean
d = 13  # feature dimensionality
S_B = np.zeros((d, d))
mean_overall = mean_overall.reshape(d, 1)  # column vector, hoisted out of the loop
for i, mean_vec in enumerate(mean_vecs):
    # Class sample count. Count on X_train_std for consistency with the
    # rest of the computation (original counted on X_train; the count is
    # identical since standardization preserves the rows).
    n = X_train_std[y_train == i + 1, :].shape[0]
    mean_vec = mean_vec.reshape(d, 1)  # column vector
    S_B += n * (mean_vec - mean_overall).dot((mean_vec - mean_overall).T)
- 特征分解
# Solve the LDA eigenproblem by eigendecomposing S_W^{-1} S_B
# (np.linalg.inv = matrix inverse).
eigen_vals, eigen_vecs = np.linalg.eig(np.linalg.inv(S_W) @ S_B)
查看特征值分布
# Pair each eigenvalue's magnitude with its eigenvector and rank the
# discriminant directions by decreasing magnitude.
eigen_pairs = sorted(
    ((np.abs(eigen_vals[k]), eigen_vecs[:, k]) for k in range(len(eigen_vals))),
    key=lambda pair: pair[0],
    reverse=True)
for pair in eigen_pairs:
    print(pair[0])
# Fraction of "discriminability" captured by each direction, plus the
# cumulative sum across directions.
tot = sum(eigen_vals.real)
discr = [val / tot for val in sorted(eigen_vals.real, reverse=True)]
cum_discr = np.cumsum(discr)
# Bar chart of per-direction "discriminability" with the cumulative
# curve overlaid as a step plot.
plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK labels correctly
positions = range(1, 14)
plt.bar(positions, discr, alpha=0.5, align='center', label='“区分度”分布')
plt.step(positions, cum_discr, where='mid', label='累计“区分度”')
plt.ylabel('“区分度”比例')
plt.xlabel('特征维度')
plt.ylim([-0.1, 1.1])
plt.legend(loc='best')
plt.show()
- 降维
# Projection matrix W built from the real parts of the two leading
# eigenvectors, then project the standardized training data onto the
# 2-D LDA subspace.
w = np.column_stack((eigen_pairs[0][1].real, eigen_pairs[1][1].real))
X_train_lda = X_train_std @ w
显示降维结果
# Scatter plot of the LDA-projected training data, one color/marker per class.
colors = ['r', 'b', 'g']
markers = ['s', 'x', 'o']
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
for lbl, color, marker in zip(np.unique(y_train), colors, markers):
    mask = y_train == lbl
    # LD2 is negated purely to flip the plot's vertical orientation.
    plt.scatter(X_train_lda[mask, 0],
                -X_train_lda[mask, 1],
                c=color, label=lbl, marker=marker)
plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc='lower right')
plt.show()