Python Implementation of the PCA Algorithm (Analytic Geometry, Curve Fitting, Machine Learning, Image Recognition)

The PCA Algorithm

An Introduction to the PCA Algorithm and Its Principles

A previous article, "PCA Algorithm: Principles and Applications - Analytic Geometry - Curve Fitting - Machine Learning and Image Recognition" (click here), explained the mathematics behind PCA in detail, but the code there was incomplete. This post publishes the full code for reference.
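
For readers who only need the recipe that the full program below follows, the whole pipeline comes down to four NumPy steps: center the data, form the covariance matrix, take its leading eigenvectors, and project onto them. The following is a minimal sketch of that recipe; pca_sketch is a hypothetical helper name that does not appear in the original code, X is assumed to be a (features x samples) array and R the target dimension.

import numpy as np

def pca_sketch(X, R):
    mean = X.mean(axis=1, keepdims=True)   # per-feature mean, shape (d, 1)
    Z = X - mean                           # zero-mean the data
    C = Z @ Z.T / Z.shape[1]               # covariance matrix, shape (d, d)
    eigvals, eigvecs = np.linalg.eigh(C)   # symmetric matrix: real eigenvalues in ascending order
    basis = eigvecs[:, ::-1][:, :R]        # top-R principal directions as columns
    return basis.T @ Z, mean, basis        # projected data (R x n), plus mean and basis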

Python Implementation of the PCA Algorithm

import os
import struct
import sys

import numpy as np
import openpyxl


# Return the mean vector and the zero-mean (centered) data matrix
def get_mean_matrix(Data):
    m = Data.mean(axis=1)  # mean along axis 1, i.e. the mean of every feature across all samples
    m = m.reshape(m.shape[0], 1)
    zm = Data - m  # subtract the mean so every feature has zero mean
    return m, zm


# Return the covariance matrix of the centered data
def get_covariance_matrix(zm):
    if zm.shape[0] > zm.shape[1]:
        # When the feature dimension is much larger than the number of samples,
        # work with the smaller (n_samples x n_samples) matrix instead
        C = np.matmul(zm.T, zm)
        print("[(Data-Data.mean).T*(Data - Data.mean)].shape:", C.shape)
        # print("[(Data-Data.mean).T*(Data - Data.mean)]:")
    else:
        C = np.dot(zm, zm.T)
        print("[(Data-Data.mean)*(Data - Data.mean).T].shape:", C.shape)
        # print("[(Data-Data.mean)*(Data - Data.mean).T]:")
    C = C / zm.shape[1]
    return C
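

# Note on the d >> n branch above: the eigenvectors of (zm.T @ zm) live in sample space
# (length n_samples), not feature space, so before they can serve as a projection basis
# they have to be lifted back with zm @ v and re-normalised. With the settings used in
# this post (M = 2000 samples of 784 features) the else-branch runs and nothing changes;
# the helper below is only a hedged sketch of that mapping, not part of the original
# pipeline.
def map_sample_space_eigenvectors(zm, v):
    u = np.dot(zm, v)                  # lift eigenvectors into feature space
    u = u / np.linalg.norm(u, axis=0)  # re-normalise each column to unit length
    return u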


# Return the eigenvector matrix with columns sorted by descending eigenvalue
def get_sorted_eigen_vectors(cm):
    eigenvalue, eigenvectors = np.linalg.eig(cm)  # eigenvalues and the corresponding eigenvectors (as columns)
    Ind = np.argsort(eigenvalue)  # indices that sort the eigenvalues in ascending order
    Ind = Ind[::-1]  # reverse to get descending order
    sorted_column_index = Ind[:eigenvectors.shape[0]]
    sorted_eigenvectors = eigenvectors[:, sorted_column_index]
    return sorted_eigenvectors
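

# Since the covariance matrix is symmetric, np.linalg.eigh is a natural alternative to
# np.linalg.eig in get_sorted_eigen_vectors: it guarantees real eigenvalues (returned in
# ascending order) and orthonormal eigenvectors, avoiding the tiny imaginary parts eig
# can produce from floating-point noise. A hedged drop-in variant, not part of the
# original listing:
def get_sorted_eigen_vectors_sym(cm):
    eigenvalue, eigenvectors = np.linalg.eigh(cm)  # ascending eigenvalues, eigenvectors as columns
    return eigenvectors[:, ::-1]                   # reverse columns for descending eigenvalue order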


# Project the data matrix onto the new basis
def get_data_in_new_space(Data, new_space):
    Data_in_new_space = np.dot(new_space, Data)  # reduced representation of all samples (R x n_samples)
    return Data_in_new_space


# Return the basis of the reduced space: the eigenvectors of the R largest eigenvalues, as rows
def get_reduced_base_space(eigenVectors, R):
    reduced_base_space = eigenVectors[:, :R].T  # the top-R eigenvectors span the reduced space
    return reduced_base_space


# Project a test sample into the reduced space, find the closest training sample and return its label
def recognize(testImageArray, reduced_base_space, m, labels, new_data):
    testImageArray = np.asarray(testImageArray).reshape(-1, 1)  # make sure we have a (d, 1) column vector
    differenceTestImage = testImageArray - m  # zero-mean the test sample with the training mean
    # project the test sample onto the R basis vectors of the reduced space
    projectedTestImage = get_data_in_new_space(differenceTestImage, reduced_base_space)
    # find the training sample whose projection is closest to the projected test sample
    index_close = 0
    min_distance = sys.float_info.max
    for i in range(0, new_data.shape[1]):
        q = new_data[:, i]
        q = q.reshape(q.shape[0], 1)
        distance = np.linalg.norm(projectedTestImage - q)
        if distance < min_distance:
            min_distance = distance
            index_close = i
    return labels[index_close]
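

# The nearest-neighbour loop in recognize can also be vectorised with broadcasting; the
# helper below is a hedged equivalent (not part of the original listing), assuming
# projected_test is an (R, 1) column and projected_train is (R, n_samples).
def nearest_projection_index(projected_test, projected_train):
    distances = np.linalg.norm(projected_train - projected_test, axis=0)  # one distance per column
    return int(np.argmin(distances))                                      # index of the closest sample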


# Run recognition on a single training sample and check whether the predicted label matches
def test(test_label, Data, labels, index_label, reduced_base_space, m, Y):
    testImageArray = Data[:, index_label]
    testImageArray = testImageArray.reshape(testImageArray.shape[0], 1)
    result_label = recognize(testImageArray, reduced_base_space, m, labels, Y)
    # print(index_label, "test_label:", test_label, "result_label:", result_label, test_label == result_label)
    return result_label == test_label


# For every dimensionality R, run recognition over the training samples,
# record the accuracy per dimension and save the results to an Excel file.
def train(Data, labels, N):
    if os.path.exists(RESULT_EXCEL_PATH):
        work_book = openpyxl.load_workbook(RESULT_EXCEL_PATH)
        work_sheet = work_book[SHEET_NAME_RESULT_EXCEL]
    else:
        work_book = openpyxl.Workbook()  # create a new workbook when the xlsx file does not exist yet
        _save_current_R(1)

        default_sheet = work_book["Sheet"]
        work_book.remove(default_sheet)  # drop the default sheet

        work_sheet = work_book.create_sheet(SHEET_NAME_RESULT_EXCEL)

        work_sheet.cell(1, 1, "PCA dimension R")
        work_sheet.cell(1, 2, "recognition accuracy")
        work_book.save(RESULT_EXCEL_PATH)
    print(work_sheet.title)
    mean_matrix, zm = get_mean_matrix(Data)
    cm = get_covariance_matrix(zm)
    sorted_eigen_vectors = get_sorted_eigen_vectors(cm)
    current_R = _get_current_R()
    count_of_train_data = labels.shape[0]
    print("number of training samples tested at every dimension: %s" % count_of_train_data)
    for R in range(current_R, N):
        reduced_base_space = get_reduced_base_space(sorted_eigen_vectors, R)
        new_data = get_data_in_new_space(Data, reduced_base_space)
        count_correct = 0
        count = 0
        for i in range(0, count_of_train_data):
            if test(labels[i], Data, labels, i, reduced_base_space, mean_matrix, new_data):
                count_correct += 1
            count += 1
            print("\rR=%d testing %d training samples, progress: %0.2f" % (
                R, count_of_train_data, (count / count_of_train_data) * 100) + "%", end="")
        correct_rate = count_correct / count_of_train_data
        print("\rR=%d accuracy: %0.5f" % (R, correct_rate))
        work_sheet.cell(R + 1, 1, R)
        work_sheet.cell(R + 1, 2, correct_rate)
        work_book.save(RESULT_EXCEL_PATH)
        _save_current_R(R)
    work_book.save(RESULT_EXCEL_PATH)


# Persist the last completed dimension R so an interrupted run can resume from it
def _save_current_R(R):
    with open(CURRENT_R_BUFFER, 'w') as f:
        f.write("R=%d" % R)


# Read the saved R back; the buffer file is itself importable Python containing "R=<int>"
def _get_current_R():
    try:
        import current_R
        return current_R.R
    except (ImportError, AttributeError):
        return 1


# Load the MNIST images and labels (training or test set)
def load_mnist(path, kind='train'):
    """Load MNIST data from `path`"""
    labels_path = os.path.join(path,
                               '%s-labels.idx1-ubyte'
                               % kind)
    images_path = os.path.join(path,
                               '%s-images.idx3-ubyte'
                               % kind)
    with open(labels_path, 'rb') as lbpath:
        magic, n = struct.unpack('>II',
                                 lbpath.read(8))
        labels = np.fromfile(lbpath,
                             dtype=np.uint8)

    with open(images_path, 'rb') as imgpath:
        magic, num, rows, cols = struct.unpack('>IIII',
                                               imgpath.read(16))
        images = np.fromfile(imgpath,
                             dtype=np.uint8).reshape(len(labels), 784)

    return images, labels


# Recursively print a matrix one row at a time
def matrix_printer(matrix):
    r = len(matrix.shape)
    if r == 1:
        print(matrix)
    else:
        for i in matrix:
            matrix_printer(i)


M = 2000  # sample count per train
RESULT_EXCEL_PATH = "RESULT_of_%dsamples.xlsx" % M
SHEET_NAME_RESULT_EXCEL = "PCA-reduced MNIST recognition"
CURRENT_R_BUFFER = 'current_R.py'
train_data_dir = "./MNIST数据集"  # directory containing the MNIST idx files
IMAGE_SHAPE = (28, 28)
if __name__ == "__main__":
    train_samples, labels = load_mnist(train_data_dir)  # train_samples is 60000 x 784, labels has 60000 entries
    N = train_samples.shape[1]  # maximum number of features (784)
    train_samples = train_samples[:M, :].T
    labels = labels[:M].T
    print("train_samples.shape:", train_samples.shape, "labels.shape:", labels.shape, "train_samples[1].shape:",
          train_samples[1].shape)
    train(train_samples, labels, N)

    # img = images.T[:, 0].reshape(IMAGE_SHAPE)
    # fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(4, 4),
    #                                subplot_kw={'xticks': [], 'yticks': []},
    #                                gridspec_kw=dict(hspace=0.1, wspace=0.1))
    #
    # ax0.imshow(img), ax0.set_title('test image')
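
As a quick sanity check on the hand-rolled pipeline, its projection can be compared against scikit-learn's PCA, which performs the centering and component selection internally. The snippet below is only a sketch under the assumption that scikit-learn is installed; sklearn_projection is a hypothetical helper, and train_samples is the (features x samples) matrix prepared in the main block, so it is transposed into the (n_samples, n_features) layout scikit-learn expects. The two projections will only agree up to the sign of each component.

from sklearn.decomposition import PCA

def sklearn_projection(train_samples, R):
    pca = PCA(n_components=R)                        # scikit-learn's PCA, centering handled internally
    projected = pca.fit_transform(train_samples.T)   # sklearn expects (n_samples, n_features)
    return projected.T                               # back to (R, n_samples) to match the listing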
