PCA算法
PCA算法及其原理介绍
在上一篇文章《PCA 算法原理与应用——解析集合、曲线拟合、机器学习图像识别》(点这里)中详细地介绍了 PCA 算法的数学原理,但当时的代码不够完整。本文专门整理并发布完整代码,以供大家参考。
PCA算法python实现
import os
import struct
import sys
import numpy as np
import openpyxl
from xlwt import Workbook
# Compute the per-feature mean and the zero-centered data matrix.
def get_mean_matrix(Data):
    """Return (mean, zero-mean data) for a features-by-samples matrix.

    Data: one sample per column; axis=1 averages across the samples.
    Returns the mean as a column vector and Data with that mean removed.
    """
    mean_vec = Data.mean(axis=1).reshape(-1, 1)
    centered = Data - mean_vec
    return mean_vec, centered
# Build the covariance-style matrix of zero-mean data.
def get_convariance_matrix(zm):
    """Return the scatter matrix of zero-mean data `zm`, scaled by the
    sample count.

    When there are more feature rows than sample columns, the smaller
    samples-by-samples Gram matrix zm.T @ zm is formed instead (the classic
    eigenface trick); otherwise the usual zm @ zm.T is used.
    """
    n_features, n_samples = zm.shape
    if n_features > n_samples:
        # feature dimension far exceeds the sample count: use the small Gram matrix
        C = np.matmul(zm.T, zm)
        print("[(Data-Data.mean).T*(Data - Data.mean)].shape:", C.shape)
    else:
        C = np.dot(zm, zm.T)
        print("[(Data-Data.mean)*(Data - Data.mean).T].shape:", C.shape)
    return C / n_samples
# Eigendecompose and order the eigenvectors by eigenvalue.
def get_sorted_eign_vectors(cm):
    """Return the eigenvectors of `cm` as columns, ordered by descending
    eigenvalue (largest first)."""
    values, vectors = np.linalg.eig(cm)
    # descending-eigenvalue column order, capped at the matrix dimension
    order = np.argsort(values)[::-1][:vectors.shape[0]]
    return vectors[:, order]
# Project the data matrix onto a new basis.
def get_data_in_new_space(Data, new_space):
    """Project `Data` (features x samples) onto the row basis `new_space`,
    yielding an (R x samples) reduced representation."""
    projection = np.dot(new_space, Data)
    return projection
# Select the reduced basis from the sorted eigenvector matrix.
def get_reduced_base_space(eigenVectors, R):
    """Take the eigenvectors of the R largest eigenvalues (the first R
    columns of the pre-sorted matrix) and return them as rows — the basis
    of the reduced space."""
    top_R_columns = eigenVectors[:, :R]
    return top_R_columns.T
# Project a test sample into the reduced space and return the label of the
# nearest projected training sample.
def recognize(testImageArray, reduced_base_space, m, labels, new_data):
    """Classify one test sample by nearest neighbour in the reduced space.

    testImageArray     : one sample as a column vector.
    reduced_base_space : (R x features) basis rows from get_reduced_base_space.
    m                  : training-set mean column vector (for zero-centering).
    labels             : label per training sample, indexed like new_data columns.
    new_data           : (R x samples) projected training data.

    Returns the label of the training column closest (Euclidean distance)
    to the projected test sample.
    """
    testImageArray = np.mat(np.array(testImageArray))
    differenceTestImage = testImageArray - m  # zero-center with the training mean
    # project into the R-dimensional space (same operation as get_data_in_new_space)
    projectedTestImage = np.dot(reduced_base_space, differenceTestImage)
    # Vectorized nearest-neighbour search: one norm per training column,
    # replacing the original per-column Python loop with reshape/norm calls.
    diffs = np.asarray(new_data) - np.asarray(projectedTestImage)
    distances = np.linalg.norm(diffs, axis=0)
    # argmin returns the first minimum, matching the original strict '<' scan
    index_close = int(np.argmin(distances))
    return labels[index_close]
# Recognition test for a single training sample.
def test(test_label, Data, labels, index_label, reduced_base_space, m, Y):
    """Recognize the sample at column `index_label` of `Data` and return
    True when the predicted label equals `test_label`."""
    sample = Data[:, index_label].reshape(-1, 1)
    predicted = recognize(sample, reduced_base_space, m, labels, Y)
    return predicted == test_label
# Test the training set at every PCA dimension and record per-dimension
# accuracy in an Excel file.
def train(Data, labels, N):
    """Run nearest-neighbour recognition for PCA dimensions R = current..N-1
    and write each dimension's accuracy into an Excel sheet.

    Data   : feature matrix, one training sample per column.
    labels : label for each column of Data.
    N      : exclusive upper bound on the PCA dimensions to test.

    Results are appended to RESULT_EXCEL_PATH (created on first run) and the
    last processed dimension is checkpointed via _save_current_R so an
    interrupted run can resume.
    """
    # NOTE(review): the original built an unused xlwt Workbook here; removed —
    # all spreadsheet I/O goes through openpyxl.
    if os.path.exists(RESULT_EXCEL_PATH):
        work_book = openpyxl.load_workbook(RESULT_EXCEL_PATH)
        # subscript access replaces the deprecated get_sheet_by_name()
        work_sheet = work_book[SHEET_NAME_RESULT_EXCEL]
    else:
        work_book = openpyxl.Workbook()  # no xlsx file yet: create a new one
        _save_current_R(1)
        work_book.remove(work_book["Sheet"])  # drop the default sheet
        work_sheet = work_book.create_sheet(SHEET_NAME_RESULT_EXCEL)
        work_sheet.cell(1, 1, "PCA维数")
        work_sheet.cell(1, 2, "识别准确率")
        work_book.save(RESULT_EXCEL_PATH)
    print(work_sheet.title)
    mean_matrix, zm = get_mean_matrix(Data)
    cm = get_convariance_matrix(zm)
    sorted_eigen_vectors = get_sorted_eign_vectors(cm)
    current_R = _get_current_R()
    count_of_train_data = labels.shape[0]
    print("每一维上都进行测试的训练样本数据量%s个" % count_of_train_data)
    for R in range(current_R, N):
        reduced_base_space = get_reduced_base_space(sorted_eigen_vectors, R)
        new_data = get_data_in_new_space(Data, reduced_base_space)
        count_correct = 0
        count = 0
        for i in range(0, count_of_train_data):
            if test(labels[i], Data, labels, i, reduced_base_space, mean_matrix, new_data):
                count_correct += 1
            count += 1
            print("\r维度R=%d 正在进行测试数据,计算识别正确率,总的训练样本数量是%d process:%0.2f" % (
                R, count_of_train_data, (count / count_of_train_data) * 100) + "%", end="")
        correct_rate = count_correct / count_of_train_data
        print("\rR=%d 准确率:%0.5f" % (R, correct_rate))
        work_sheet.cell(R + 1, 1, R)  # row R+1: row 1 holds the headers
        work_sheet.cell(R + 1, 2, correct_rate)
        work_book.save(RESULT_EXCEL_PATH)
        # NOTE(review): checkpointing R (not R + 1) makes a resumed run
        # repeat the last completed dimension once — confirm if intended.
        _save_current_R(R)
    work_book.save(RESULT_EXCEL_PATH)
def _save_current_R(R):
    """Persist the current PCA dimension as a tiny importable module
    (read back by _get_current_R for resume-after-interrupt)."""
    with open(CURRENT_R_BUFFER, 'w') as checkpoint:
        checkpoint.write("R=%d" % R)
def _get_current_R():
try:
import current_R
return current_R.R
except:
return 1
# Load the MNIST training / test data from idx-format files.
def load_mnist(path, kind='train'):
    """Load MNIST images and labels from `path`.

    path : directory holding '<kind>-labels.idx1-ubyte' and
           '<kind>-images.idx3-ubyte'.
    kind : file-name prefix, e.g. 'train' or 't10k'.

    Returns (images, labels): images is a (num_samples, rows*cols) uint8
    array (one flattened image per row), labels a (num_samples,) uint8 array.
    """
    labels_path = os.path.join(path, '%s-labels.idx1-ubyte' % kind)
    images_path = os.path.join(path, '%s-images.idx3-ubyte' % kind)
    with open(labels_path, 'rb') as lbpath:
        magic, n = struct.unpack('>II', lbpath.read(8))
        labels = np.fromfile(lbpath, dtype=np.uint8)
    with open(images_path, 'rb') as imgpath:
        magic, num, rows, cols = struct.unpack('>IIII', imgpath.read(16))
        # use the header's dimensions instead of the hard-coded 784 so any
        # idx3 image size loads correctly (28*28 = 784 for MNIST)
        images = np.fromfile(imgpath, dtype=np.uint8).reshape(len(labels), rows * cols)
    return images, labels
def matrix_printer(matrix):
    """Print an array one innermost (1-D) row per line, recursing through
    the leading axes of higher-dimensional arrays."""
    if len(matrix.shape) == 1:
        print(matrix)
        return
    for sub_matrix in matrix:
        matrix_printer(sub_matrix)
M = 2000  # sample count per train
# Excel file the per-dimension accuracies are written to (one file per M).
RESULT_EXCEL_PATH = "RESULT_of_%dsamples.xlsx" % M
# Worksheet title (Chinese: "PCA dimensionality reduction, MNIST recognition").
SHEET_NAME_RESULT_EXCEL = "PCA降维优化MNIST数据集识别"
# Checkpoint module written by _save_current_R and imported by _get_current_R.
CURRENT_R_BUFFER = 'current_R.py'
# Directory containing the MNIST idx files (Chinese: "MNIST dataset").
train_data_dir = "./MNIST数据集"
# Each MNIST image is 28x28 pixels (flattened to 784 features).
IMAGE_SHAPE = (28, 28)
if __name__ == "__main__":
    # train_samples: (num_samples, 784) uint8; labels: (num_samples,)
    train_samples, labels = load_mnist(train_data_dir)
    # BUGFIX: the maximum PCA dimension is the FEATURE count (784), i.e.
    # shape[1] of the raw samples-by-features matrix.  The original used
    # shape[0], which is the sample count (e.g. 60000) and would loop far
    # past the number of available eigenvectors.
    N = train_samples.shape[1]
    # keep the first M samples and transpose to one sample per column
    train_samples = train_samples[:M, :].T
    labels = labels[:M].T
    print("train_samples.shape:", train_samples.shape, "labels.shape:", labels.shape, "train_samples[1].shape:",
          train_samples[1].shape)
    train(train_samples, labels, N)