FDA原理
FDA代码
此次的数据来源为二进制文件 t10k-images-idx3-ubyte、t10k-labels-idx1-ubyte、train-images-idx3-ubyte、train-labels-idx1-ubyte,分别为测试集数据、测试集标签、训练集数据、训练集标签。读入数据之后,对数据进行标准化。代码如下:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mnist import MNIST
import sys
def normalization(x):
    """Standardize every row of ``x`` to zero mean and unit standard deviation.

    Args:
        x: 2-D array-like of numbers; each row is standardized independently.

    Returns:
        numpy.ndarray of floats with the same shape as ``x``.

    Rows whose values are all identical have a standard deviation of zero;
    they are returned as all zeros instead of NaN/inf so that downstream
    linear algebra is not poisoned by non-finite values.
    """
    x = np.asarray(x, dtype=float)
    # keepdims=True keeps an (n, 1) shape so the statistics broadcast across
    # the columns without materializing full-size copies.
    mean = np.mean(x, axis=1, keepdims=True)
    std = np.std(x, axis=1, keepdims=True)
    # A constant row is already zero after centering; dividing by 1 keeps it so.
    safe_std = np.where(std == 0, 1.0, std)
    return (x - mean) / safe_std
def load_mnist(only_binary=True, y_1=6, y_2=8,
               data_dir=r'E:\2016.11\work\PRML'):
    """Load MNIST through ``MNIST`` and standardize every sample.

    Args:
        only_binary: when true, keep only the samples labelled ``y_1`` or
            ``y_2``; otherwise keep every sample.
        y_1: first label kept when ``only_binary`` is true.
        y_2: second label kept when ``only_binary`` is true.
        data_dir: directory handed to ``MNIST``; the default is the path
            that used to be hard-coded here.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)``. The ``x_*`` values are
        the outputs of ``normalization``; the ``y_*`` values are the labels
        of the kept samples (lists when ``only_binary`` is true, otherwise
        whatever the loader returned).
    """
    # Kept from the original implementation: the data directory is also
    # appended to the module search path.
    sys.path.append(data_dir)
    mndata = MNIST(data_dir)
    x_train, y_train = mndata.load_training()  # training images / labels
    x_test, y_test = mndata.load_testing()  # test images / labels

    if only_binary:
        wanted = (y_1, y_2)
        # Compute the kept indices once per split so that samples and labels
        # are guaranteed to stay aligned.
        keep_train = [i for i, label in enumerate(y_train) if label in wanted]
        keep_test = [i for i, label in enumerate(y_test) if label in wanted]
        x_train = [x_train[i] for i in keep_train]
        y_train = [y_train[i] for i in keep_train]
        x_test = [x_test[i] for i in keep_test]
        y_test = [y_test[i] for i in keep_test]

    x_train = normalization(x_train)
    x_test = normalization(x_test)
    print('data-reading success')
    print('x_train:', np.size(x_train))
    return x_train, y_train, x_test, y_test
# Announce the load, then read the data with load_mnist()'s defaults
# (only_binary=True, labels 6 and 8) into module-level names.
# NOTE(review): these are top-level statements, so they run whenever this
# file is executed or imported - confirm that loading on import is intended.
print('reading data.....')
x_train,y_train,x_test,y_test = load_mnist()
基于Fisher判别法的模型训练
得到数据集 $x_1$、$x_2$、$y_1$、$y_2$ 之后,则可以代入公式(13),从而算出 $w$,代入公式(13),从而算出 $w_0$。
import numpy as np
from numpy.linalg import pinv
def FDA_train(x_1, x_2):
    """Fit a two-class Fisher linear discriminant.

    Args:
        x_1: 2-D array-like, one row per sample of the first class.
        x_2: 2-D array-like, one row per sample of the second class; must
            have the same number of columns as ``x_1``.

    Returns:
        Tuple ``(w_star, w_0)``. ``w_star`` is the projection direction
        ``pinv(S_w) (m_1 - m_2)`` as a d-by-1 ``numpy.matrix`` (kept as a
        matrix so callers that multiply with ``*`` still get a matrix
        product). ``w_0`` is the bias ``-(m_1_tilde + m_2_tilde) / 2`` where
        ``m_k_tilde`` is the mean projection of class ``k``; the midpoint
        between the two projected class means therefore satisfies
        ``projection + w_0 == 0``.

    Raises:
        ValueError: if either class is empty or the feature counts differ.
    """
    x_1 = np.asarray(x_1, dtype=float)
    x_2 = np.asarray(x_2, dtype=float)
    n_1, d_1 = np.shape(x_1)
    n_2, d_2 = np.shape(x_2)
    if n_1 == 0 or n_2 == 0:
        raise ValueError("both classes need at least one sample")
    if d_1 != d_2:
        raise ValueError("x_1 and x_2 must have the same number of features")

    m_1 = np.mean(x_1, axis=0)
    m_2 = np.mean(x_2, axis=0)

    # Within-class scatter: sum over samples of (x - m)(x - m)^T, computed as
    # one matrix product per class instead of a Python loop of outer products.
    centered_1 = x_1 - m_1
    centered_2 = x_2 - m_2
    S_w = np.dot(centered_1.T, centered_1) + np.dot(centered_2.T, centered_2)

    # The pseudo-inverse tolerates a singular scatter matrix.
    direction = np.dot(pinv(S_w), m_1 - m_2)

    m_1_tilde = np.dot(x_1, direction).sum() / float(n_1)
    m_2_tilde = np.dot(x_2, direction).sum() / float(n_2)
    w_0 = -(m_1_tilde + m_2_tilde) / float(2)

    # np.asmatrix replaces np.mat, which no longer exists in NumPy 2.x.
    w_star = np.asmatrix(direction).T
    return w_star, w_0
def FDA_test(x_test, w_star, w_0):
    """Classify samples with a trained Fisher discriminant.

    Args:
        x_test: 2-D array-like, one row per sample.
        w_star: projection direction with one entry per feature (a d-by-1
            matrix, a d-by-1 array or a flat sequence are all accepted).
        w_0: bias term of the discriminant ``g(x) = x . w_star + w_0``.

    Returns:
        n-by-2 integer ``numpy.matrix`` of one-hot predictions: column 0 is
        1 where ``g(x) >= 0`` (first class), column 1 is 1 where
        ``g(x) < 0`` (second class).
    """
    samples = np.asarray(x_test, dtype=float)
    direction = np.asarray(w_star, dtype=float).reshape(-1, 1)
    # w_0 is a bias, not a threshold: it is added to the projection and the
    # sum is compared against zero. Comparing the raw projection with w_0
    # would place the boundary at the negative of the intended midpoint.
    scores = np.dot(samples, direction) + w_0
    y_1 = scores >= 0
    y_2 = scores < 0
    y_pred = np.hstack((y_1, y_2)).astype(int)
    # Returned as a matrix to keep the original return type for callers.
    return np.asmatrix(y_pred)
对于上述算出的 $w$,通过公式(14)计算错误率。
import numpy as np
from Load import load_mnist
from FDA import FDA_test,FDA_train
# Train the Fisher discriminant on two MNIST digits and report the test
# error rate.
y_1 = 6
y_2 = 8
x_train, y_train, x_test, y_test = load_mnist(only_binary=True, y_1=y_1, y_2=y_2)

# Split the training samples by class with boolean masks.
train_samples = np.asarray(x_train)
train_labels = np.asarray(y_train)
x_train_0 = train_samples[train_labels == y_1]
x_train_1 = train_samples[train_labels == y_2]

w_star, w_0 = FDA_train(x_train_0, x_train_1)
y_pred = FDA_test(x_test, w_star, w_0)

# One-hot ground truth laid out like y_pred: column 0 marks label y_1,
# column 1 marks label y_2.
test_labels = np.asarray(y_test)
y_true = np.column_stack((test_labels == y_1, test_labels == y_2)).astype(int)

# A misclassified sample differs from the truth in both columns, hence the
# factor 2. float() forces true division on every Python version.
error_rate = float(np.abs(y_pred - y_true).sum()) / (2 * len(test_labels))
print("error rate is: %.4f" % error_rate)