I used the sonar dataset for this experiment.
Import the data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the sonar.all-data dataset
sonar = pd.read_csv('sonar.all-data', header=None, sep=',')
sonar
The data looks like this:
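Before going further, it is worth confirming the layout of the file: 208 rows with 60 numeric feature columns followed by a label column of 'R'/'M'. A quick check (a minimal sketch; the expected counts assume the standard UCI sonar.all-data file):

print(sonar.shape)               # expected (208, 61): 60 features + 1 label column
print(sonar[60].value_counts())  # expected 97 'R' (rock) and 111 'M' (mine) samples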
Data processing
# Data preprocessing: keep the 60 numeric feature columns
sonar1 = sonar.iloc[0:208, 0:60]
sonar2 = np.array(sonar1, dtype=float)  # convert the DataFrame to a NumPy array
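The experiment below splits the samples by row position (rows 0-96 are class 1, rows 97-207 are class 2), which relies on the file listing all 'R' samples before the 'M' samples. If you prefer not to depend on row order, the same split can be derived from the label column (a sketch, not part of the original code):

labels = sonar.iloc[:, 60].values   # 'R' / 'M' class labels
rocks = sonar2[labels == 'R']       # should match sonar2[0:97]
mines = sonar2[labels == 'M']       # should match sonar2[97:208]
print(rocks.shape, mines.shape)     # expected (97, 60) and (111, 60)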
Define the Fisher function
def LDA(X1, X2, n):
    # Class mean vectors: per-feature mean over all samples (axis=0)
    m1 = np.mean(X1, axis=0)
    m2 = np.mean(X2, axis=0)
    # Reshape the mean vectors into column vectors for the matrix algebra below
    m1 = m1.reshape(n, 1)
    m2 = m2.reshape(n, 1)
    # Within-class scatter matrices
    S1 = np.zeros((n, n))
    S2 = np.zeros((n, n))
    for i in range(X1.shape[0]):
        S1 += (X1[i].reshape(n, 1) - m1).dot((X1[i].reshape(n, 1) - m1).T)
    for i in range(X2.shape[0]):
        S2 += (X2[i].reshape(n, 1) - m2).dot((X2[i].reshape(n, 1) - m2).T)
    # Total within-class scatter matrix S_w
    S_w = S1 + S2
    # Optimal projection direction W
    W = np.linalg.inv(S_w).dot(m1 - m2)  # np.linalg.inv() computes the matrix inverse
    return W

def Class(X, W):
    # Project a sample X (column vector) onto the direction W
    y = (W.T).dot(X)
    return y
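For reference, the direction returned by LDA() is the standard two-class Fisher solution: it maximizes the ratio of between-class separation to within-class scatter of the projected data (notation matches the code),

$$
J(\mathbf{w}) = \frac{\left(\mathbf{w}^{\mathsf T}(\mathbf{m}_1 - \mathbf{m}_2)\right)^2}{\mathbf{w}^{\mathsf T} S_w \mathbf{w}},
\qquad
\mathbf{w}^{*} \propto S_w^{-1}(\mathbf{m}_1 - \mathbf{m}_2).
$$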
Run the experiment
G1 = np.zeros(97)   # projected values for the 97 class-1 samples
G2 = np.zeros(111)  # projected values for the 111 class-2 samples
p1 = sonar2[0:97, 0:37]    # class 1, first 37 features
p2 = sonar2[97:208, 0:37]  # class 2, first 37 features
W = LDA(p1, p2, 37)
for i in range(208):
    if i <= 96:
        test = p1[i].reshape(37, 1)
        G1[i] = Class(test, W).item()   # .item() extracts the scalar projection
    else:
        test = p2[i-97].reshape(37, 1)
        G2[i-97] = Class(test, W).item()
y1 = np.zeros(97)         # plot class 1 along y = 0
y2 = np.zeros(111) + 0.2  # plot class 2 along y = 0.2
plt.figure(1)
plt.ylim((-0.5,0.5))
plt.xlim((-0.1,0.1))
plt.scatter(G1,y1,c='red',alpha=1,marker='.',label='G1')
plt.scatter(G2,y2,c='k',alpha=1,marker='.',label='G2')
plt.legend()
plt.show()
The output looks like this:
The two classes are not separated very well, so I referred to Mr_Lowbee's write-up for further testing and added a classification threshold W0 together with leave-one-out evaluation.
The code is as follows:
def Fisher(X1, X2, n, c):
    # Class mean vectors: per-feature mean over all samples (axis=0)
    m1 = np.mean(X1, axis=0)
    m2 = np.mean(X2, axis=0)
    # Reshape the mean vectors into column vectors for the matrix algebra below
    m1 = m1.reshape(n, 1)
    m2 = m2.reshape(n, 1)
    # Within-class scatter matrices
    S1 = np.zeros((n, n))
    S2 = np.zeros((n, n))
    # c marks which class the left-out sample came from:
    # c == 0 -> X1 holds 96 training samples, X2 holds 111
    # c == 1 -> X1 holds 97 training samples, X2 holds 110
    if c == 0:
        for i in range(0, 96):
            S1 += (X1[i].reshape(n, 1) - m1).dot((X1[i].reshape(n, 1) - m1).T)
        for i in range(0, 111):
            S2 += (X2[i].reshape(n, 1) - m2).dot((X2[i].reshape(n, 1) - m2).T)
    if c == 1:
        for i in range(0, 97):
            S1 += (X1[i].reshape(n, 1) - m1).dot((X1[i].reshape(n, 1) - m1).T)
        for i in range(0, 110):
            S2 += (X2[i].reshape(n, 1) - m2).dot((X2[i].reshape(n, 1) - m2).T)
    # Total within-class scatter matrix S_w
    S_w = S1 + S2
    # Optimal projection direction W
    W = np.linalg.inv(S_w).dot(m1 - m2)  # np.linalg.inv() computes the matrix inverse
    # Projected class means in the one-dimensional space
    m_1 = (W.T).dot(m1)
    m_2 = (W.T).dot(m2)
    # Classification threshold W0: midpoint of the two projected means
    W0 = -0.5 * (m_1 + m_2)
    return W, W0

def Classify(X, W, W0):
    # Decision value: positive -> class 1, negative -> class 2
    y = (W.T).dot(X) + W0
    return y
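With this threshold, the decision rule used below is simply a sign check on the projected value: a test sample $\mathbf{x}$ is assigned to class 1 when

$$
\mathbf{W}^{\mathsf T}\mathbf{x} + W_0 \ge 0
\quad\Longleftrightarrow\quad
\mathbf{W}^{\mathsf T}\mathbf{x} \ge \tfrac{1}{2}\left(\tilde m_1 + \tilde m_2\right),
$$

where $\tilde m_1, \tilde m_2$ are the projected class means computed in Fisher().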
# Looking at the results, using the first n=37 features gives the same accuracy as using all 60, so n=37 is chosen to reduce the amount of computation
G1 = np.zeros(97)   # decision values for the 97 class-1 samples
G2 = np.zeros(111)  # decision values for the 111 class-2 samples
p1 = sonar2[0:97, 0:37]    # class 1, first 37 features
p2 = sonar2[97:208, 0:37]  # class 2, first 37 features
count = 0  # number of correctly classified left-out samples
for i in range(208):
    if i <= 96:
        # Leave one class-1 sample out and train on the remaining samples
        test = p1[i].reshape(37, 1)
        train = np.delete(p1, i, axis=0)
        W, W0 = Fisher(train, p2, 37, 0)
        G1[i] = Classify(test, W, W0).item()
        if G1[i] >= 0:   # positive decision value -> classified as class 1 (correct)
            count += 1
    else:
        # Leave one class-2 sample out and train on the remaining samples
        test = p2[i-97].reshape(37, 1)
        train = np.delete(p2, i-97, axis=0)
        W, W0 = Fisher(p1, train, 37, 1)
        G2[i-97] = Classify(test, W, W0).item()
        if G2[i-97] < 0:  # negative decision value -> classified as class 2 (correct)
            count += 1
y1 = np.zeros(97)         # plot class 1 along y = 0
y2 = np.zeros(111) + 0.2  # plot class 2 along y = 0.2
plt.figure(1)
plt.ylim((-0.5,0.5))
plt.xlim((-0.01,0.01))
plt.scatter(G1,y1,c='red',alpha=1,marker='.',label='G1')
plt.scatter(G2,y2,c='k',alpha=1,marker='.',label='G2')
plt.legend()
plt.show()
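The count variable above already tallies how many of the 208 left-out samples were classified correctly, but it is never reported; a one-line addition (not in the original code) prints the leave-one-out accuracy alongside the plot:

print('Leave-one-out accuracy: %d / 208 = %.3f' % (count, count / 208))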
The output looks like this:
The separation between the two classes is clearly better than before.
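As a sanity check on the hand-written implementation (not part of the original experiment; this sketch assumes scikit-learn is installed), the same leave-one-out evaluation can be run with the library's LDA classifier:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import LeaveOneOut, cross_val_score

X = np.asarray(sonar.iloc[:, 0:37], dtype=float)  # the same first 37 features
y = sonar.iloc[:, 60].values                      # 'R' / 'M' labels
scores = cross_val_score(LinearDiscriminantAnalysis(), X, y, cv=LeaveOneOut())
print('sklearn LDA leave-one-out accuracy: %.3f' % scores.mean())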