转载PCA python实现_pcapython实现demo-CSDN博客

博客主要介绍PCA的Python实现。先阐述理解PCA的几个背景概念，如坐标系、基向量等；接着讲解使用sklearn.decomposition中的PCA，并将降维前后特征输入LR逻辑回归验证效果；最后介绍自定义PCA实现，有小改动。

PCA python实现

转载来自：https://blog.csdn.net/ws_developer/article/details/81634059
部分代码有修改。

1.理解PCA的几个背景概念：

坐标系：
基向量：
矩阵乘法：
变换（线性变换/基变换）：
特征：
方差：
协方差：
特征值分解：

2.使用sklearn.decomposition中PCA

将降维前后的特征分别输入 LR（逻辑回归）模型，对比得分以验证降维效果。

from sklearn import datasets
from sklearn.decomposition import PCA
import matplotlib.pylab as plt
import numpy as np
from sklearn.linear_model import LogisticRegression

# Load the scikit-learn digits dataset: feature matrix and class labels.
digits = datasets.load_digits()
x = digits.data
y = digits.target

scores_before_pca = []
scores_after_pca = []

# Baseline: logistic regression on the original features.
# NOTE: the score is computed on the same data the model was fitted on,
# so it is a training accuracy, not a held-out estimate.
lr = LogisticRegression(penalty='l2', C=0.01)
lr.fit(x, y)
before_score = lr.score(x, y)
scores_before_pca.append(before_score)
print('lr.score before pca = ', before_score)

# Sweep the number of retained principal components (k = 1 .. 63).
component_counts = list(range(1, 64))
for k in component_counts:
    pca = PCA(n_components=k)
    # fit_transform fits the model and projects x in one call, so a
    # separate pca.fit(x) beforehand would only repeat the same work.
    x_new = pca.fit_transform(x)

    # Measure logistic-regression performance on the reduced features.
    lr.fit(x_new, y)
    after_score = lr.score(x_new, y)
    scores_after_pca.append(after_score)

plt.figure()
# The baseline is a single number; repeat it across the sweep so it is drawn
# as a horizontal reference line (one point with a line style draws nothing).
plt.plot(component_counts, scores_before_pca * len(component_counts),
         'k', linewidth=10)
# Use the real component counts on the x axis so it matches its label.
plt.plot(component_counts, scores_after_pca, 's', linewidth=2)
plt.xlabel('n_components', fontsize=16)
plt.ylabel('score', fontsize=16)
plt.show()

# Disabled snippet: the code below sits inside a bare triple-quoted string, so
# it is evaluated as a string expression and never executed. If enabled, it
# would plot `pca.explained_variance_` of the `pca` object that is in scope
# at that point.
'''
# 绘制降维后各特征方差值，即特征向量对应的特征值
plt.figure()
plt.plot(pca.explained_variance_, 'k', linewidth=2)
plt.xlabel('n_components', fontsize=16)
plt.ylabel('explained_variance', fontsize=16)
plt.show()
'''

3.自定义PCA实现

import numpy as np
import pandas as pd
from sklearn.datasets import load_digits


class DimensionValueError(ValueError):
    """Signal that the requested number of components is not valid for the data."""


class PCA(object):
    """Principal component analysis via eigen-decomposition of the covariance.

    The feature covariance matrix of ``x`` is decomposed into eigenvalues and
    eigenvectors; the eigenvectors with the largest eigenvalues are used as
    the projection basis.

    Note: ``reduce_dimension`` projects ``x`` exactly as given; the column
    means are not subtracted before the projection.

    Parameters:
        x: 2-D array; ``x.shape[1]`` is taken as the number of features.
        n_components: number of components to keep. When falsy (``None`` or
            ``0``), the count is chosen so that at least 99% of the total
            variance is retained.

    Raises:
        DimensionValueError: if ``n_components`` is greater than or equal to
            the number of features.
    """

    def __init__(self, x, n_components=None):
        self.x = x
        self.dimension = x.shape[1]
        # Number of components picked by the 99% variance rule; set by
        # reduce_dimension() when n_components is not given.
        self.n_byratio = 0
        if n_components and n_components >= self.dimension:
            raise DimensionValueError("n_components error")

        self.n_components = n_components

    def cov(self):
        """Return the covariance matrix between the columns (features) of x."""
        # np.cov treats each row as one variable, hence the transpose.
        return np.cov(np.transpose(self.x))

    def _sorted_eigen(self):
        """Return (eigenvalues, eigenvectors) sorted by descending eigenvalue.

        The eigenvectors are the columns of the returned matrix.
        """
        # atleast_2d keeps the single-feature case (0-d covariance) working.
        x_cov = np.atleast_2d(self.cov())
        # The covariance matrix is symmetric, so eigh applies; it guarantees
        # real eigenvalues/eigenvectors, unlike the general-purpose eig.
        values, vectors = np.linalg.eigh(x_cov)
        order = np.argsort(values)[::-1]
        return values[order], vectors[:, order]

    def get_features(self):
        """Return a table of eigenvalues and eigenvectors.

        Each row is ``[eigenvalue, eigenvector...]`` and rows are ordered by
        descending eigenvalue.
        """
        values, vectors = self._sorted_eigen()
        # Eigenvectors come back as columns; transpose so that row i holds
        # the eigenvector that belongs to eigenvalue i.
        return np.hstack((values.reshape(-1, 1), np.transpose(vectors)))

    def get_explained_varience(self):
        """Return the covariance eigenvalues in descending order."""
        values, _ = self._sorted_eigen()
        return values

    def reduce_dimension(self):
        """Project x onto the leading principal components.

        Uses ``n_components`` eigenvectors when that was given; otherwise
        keeps the smallest number of leading components whose cumulative
        variance ratio reaches 0.99, stores that count in ``n_byratio`` and
        prints it.

        Returns:
            Array with one row per row of x and one column per kept component.
        """
        features = self.get_features()
        if self.n_components:
            p = np.transpose(features[0:self.n_components, 1:])  # eigenvectors
            return np.dot(self.x, p)

        # Column 0 holds the eigenvalues, already in descending order, so the
        # cumulative ratio lines up with the rows selected below.
        varience = features[:, 0]
        varience_ratio = varience / varience.sum()
        reached = np.cumsum(varience_ratio) >= 0.99
        # argmax yields the first index reaching the threshold (0 if none).
        R = int(np.argmax(reached))
        self.n_byratio = R + 1
        print('R', R + 1)
        p = np.transpose(features[0: R + 1, 1:])
        return np.dot(self.x, p)


# Small hand-made sample that can be used instead of the digits data:
# x = np.array([[1, 3, 4], [3, 4, 5], [3, 5, 7], [1, 4, 6]])
digits = load_digits()
x, y = digits.data, digits.target

# Build the custom PCA on the full feature matrix and fetch the sorted
# eigenvalue/eigenvector table.
pca = PCA(x)
f = pca.get_features()
# print(f)

# No n_components was given, so the component count is chosen by the
# variance-ratio rule inside reduce_dimension().
x_new = pca.reduce_dimension()
print('x_new', x_new)

小改动：

from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
import matplotlib.pylab as plt
import pandas as pd
import numpy as np

# Load the digits dataset once; x1 is the feature matrix, y1 the labels.
digits = load_digits()
x1, y1 = digits.data, digits.target


class ComponentsError(ValueError):
    """Signal that n_components exceeds the number of feature columns."""


class PCA(object):
    """Principal component analysis via eigen-decomposition of the covariance.

    Note: ``reduce_dim`` projects ``x`` exactly as given; the column means
    are not subtracted before the projection.

    Parameters:
        x: 2-D array; ``x.shape[1]`` is taken as the number of features.
        n_components: number of components to keep. When falsy (``None`` or
            ``0``), the count is chosen so that at least 99% of the total
            variance is retained.

    Raises:
        ComponentsError: if ``n_components`` is larger than the number of
            features.
    """

    def __init__(self, x, n_components=None):
        self.x = x
        m = self.x.shape[1]
        self.n_components = n_components
        if self.n_components and self.n_components > m:
            # Must be raised: returning a value from __init__ is a TypeError
            # and would hide the real validation message.
            raise ComponentsError('n_components is more bigger!')

    def cov(self):
        """Return the covariance matrix between the columns (features) of x."""
        # np.cov treats each row as one variable, hence the transpose.
        return np.cov(np.transpose(self.x))

    def get_features(self):
        """Return a table of eigenvalues and eigenvectors.

        Each row is ``[eigenvalue, eigenvector...]`` and rows are ordered by
        descending eigenvalue.
        """
        # atleast_2d keeps the single-feature case (0-d covariance) working.
        x_cov = np.atleast_2d(self.cov())
        # The covariance matrix is symmetric, so eigh applies; it guarantees
        # real eigenvalues/eigenvectors, unlike the general-purpose eig.
        v, p = np.linalg.eigh(x_cov)
        order = np.argsort(v)[::-1]  # descending eigenvalue order
        # Eigenvectors come back as columns; transpose so that row i holds
        # the eigenvector that belongs to eigenvalue i.
        return np.hstack((v[order].reshape(-1, 1), np.transpose(p[:, order])))

    def reduce_dim(self):
        """Project x onto the leading principal components.

        Uses ``n_components`` eigenvectors when that was given; otherwise
        keeps the smallest number of leading components whose cumulative
        variance ratio reaches 0.99 (the ratios and the chosen count are
        printed).

        Returns:
            Array with one row per row of x and one column per kept component.
        """
        f = self.get_features()
        v = f[:, 0]   # eigenvalues, descending
        p = f[:, 1:]  # matching eigenvectors, one per row

        # Reduce to the explicitly requested number of dimensions.
        if self.n_components:
            return np.matmul(self.x, np.transpose(p[0:self.n_components]))

        v_ratio = v / v.sum()
        print('v_ratio=', v_ratio)
        reached = np.cumsum(v_ratio) >= 0.99
        # argmax yields the first index reaching the threshold (0 if none).
        R = int(np.argmax(reached))
        print('R=', R + 1)
        p_reduce_dim = p[0: R + 1]
        return np.matmul(self.x, np.transpose(p_reduce_dim))


# Build the custom PCA on the digits features and show the sorted
# eigenvalue/eigenvector table (column 0 = eigenvalue, rest = eigenvector).
pca = PCA(x1)
feats = pca.get_features()
print(feats)

x_new = pca.reduce_dim()
print('x_new', x_new)


# Plot the variance of each component after reduction, i.e. the eigenvalue
# that belongs to each eigenvector.
plt.figure()
# Only column 0 holds the eigenvalues; plotting the whole table would draw
# one line per column, including the eigenvector coordinates.
plt.plot(feats[:, 0], 'k', linewidth=2)
plt.xlabel('n_components', fontsize=16)
plt.ylabel('explained_variance', fontsize=16)
plt.show()