[Repost] PCA in Python

PCA in Python

Reposted from: https://blog.csdn.net/ws_developer/article/details/81634059
Some of the code has been modified.

1. Background concepts for understanding PCA:

Coordinate system: the frame in which the data points are expressed; PCA rotates the data into a new frame.
Basis vectors: the unit vectors that define a coordinate system; in PCA the new basis vectors are the principal components.
Matrix multiplication: multiplying the data matrix by a matrix of basis vectors projects the data into the new coordinate system.
Transformation: a change of basis is a linear transformation of the data.
Features: the columns of the data matrix, one per measured dimension.
Variance: how spread out a single feature is; PCA keeps the directions of largest variance.
Covariance: how two features vary together; PCA makes the new features mutually uncorrelated.
Eigendecomposition: factoring the covariance matrix into eigenvalues (the variances along the components) and eigenvectors (the components themselves); see the numpy sketch below.
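A minimal numpy sketch tying these concepts together, using a small hypothetical 2-D dataset (the numbers are made up for illustration): the covariance matrix of the features is eigendecomposed, and the data is projected onto the eigenvector basis.

import numpy as np

# Hypothetical toy data: 5 samples, 2 correlated features.
x = np.array([[2.5, 2.4],
              [0.5, 0.7],
              [2.2, 2.9],
              [1.9, 2.2],
              [3.1, 3.0]])

x_centered = x - x.mean(axis=0)           # center each feature (column)
x_cov = np.cov(x_centered, rowvar=False)  # 2x2 covariance matrix of the features

# Eigendecomposition: the eigenvalues are the variances along the new axes,
# the eigenvector *columns* are the new basis vectors (principal components).
eigvals, eigvecs = np.linalg.eigh(x_cov)  # eigh: symmetric matrix, ascending order
order = np.argsort(eigvals)[::-1]         # re-sort descending by variance
eigvals, eigvecs = eigvals[order], eigvecs[:, order]

# Change of basis: matrix multiplication projects the data onto the components.
x_projected = x_centered @ eigvecs
print('eigenvalues:', eigvals)
print('projected data:\n', x_projected)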

2. Using PCA from sklearn.decomposition

Feed the features before and after dimensionality reduction into logistic regression (LR) to verify the effect.

from sklearn import datasets
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression

digits = datasets.load_digits()
x = digits.data
y = digits.target

scores_before_pca = []
scores_after_pca = []

lr = LogisticRegression(penalty='l2', C=0.01, max_iter=1000)  # raise max_iter so the solver converges on the raw pixel features
lr.fit(x, y)
before_score = lr.score(x, y)
scores_before_pca.append(before_score)
print('lr.score before pca = ', before_score)

for k in range(1, 64):
    pca = PCA(n_components=k)
    pca.fit(x)
    # print('pca.components_=', pca.components_)
    # t = np.matmul(pca.components_[0], pca.components_[1])
    # print('dot product of components 0 and 1 (orthogonal, so ~0) = ', t)

    x_new = pca.transform(x)  # pca was already fitted above
    # print('x_new = ', x_new)
    # For k >= 2, the transformed features are uncorrelated:
    # v = np.cov(x_new[:, 0], x_new[:, 1])[0, 1]
    # print('covariance between transformed features 0 and 1 = ', v)

    # Test LR performance after dimensionality reduction
    lr.fit(x_new, y)
    after_score = lr.score(x_new, y)
    scores_after_pca.append(after_score)
    # print('lr.score after pca = ', after_score)

plt.figure()
# The pre-PCA score is a single value, so draw it as a horizontal reference line
# (plotting a one-element list with a line style renders nothing visible).
plt.axhline(scores_before_pca[0], color='k', linewidth=2, label='before PCA')
plt.plot(scores_after_pca, 's', linewidth=2, label='after PCA')
plt.xlabel('n_components', fontsize=16)
plt.ylabel('score', fontsize=16)
plt.legend()
plt.show()

'''
# Plot the variance of each component after reduction, i.e. the eigenvalue of each eigenvector
plt.figure()
plt.plot(pca.explained_variance_, 'k', linewidth=2)
plt.xlabel('n_components', fontsize=16)
plt.ylabel('explained_variance', fontsize=16)
plt.show()
'''
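As a side note (not in the original post), scikit-learn can also pick the number of components for a target fraction of explained variance directly, which replaces the manual 0.99 cumulative-ratio loop used in the custom implementation below. A minimal sketch:

from sklearn import datasets
from sklearn.decomposition import PCA

digits = datasets.load_digits()
x = digits.data

# A float in (0, 1) for n_components keeps just enough components
# to explain that fraction of the total variance.
pca = PCA(n_components=0.99, svd_solver='full')
x_new = pca.fit_transform(x)
print('components kept:', pca.n_components_)
print('cumulative variance ratio:', pca.explained_variance_ratio_.sum())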

3. A custom PCA implementation

import numpy as np
import pandas as pd
from sklearn.datasets import load_digits


class DimensionValueError(ValueError):
    pass


class PCA(object):
    def __init__(self, x, n_components=None):
        self.x = x
        self.dimension = x.shape[1]
        self.n_byratio = 0
        if n_components and n_components >= self.dimension:
            raise DimensionValueError("n_components must be smaller than the feature dimension")

        self.n_components = n_components

    def cov(self):
        # np.cov treats rows as variables, so transpose first: this is the
        # covariance matrix of the features (columns of x).
        x_T = np.transpose(self.x)
        x_cov = np.cov(x_T)
        return x_cov

    def get_features(self):
        x_cov = self.cov()
        a, b = np.linalg.eig(x_cov)
        # print('a', a)
        # print('b', b)
        # np.linalg.eig returns the eigenvectors as *columns* of b, so
        # transpose before stacking: each row of c is [eigenvalue, eigenvector].
        m = a.shape[0]
        c = np.hstack((a.reshape((m, 1)), np.transpose(b)))
        c_df = pd.DataFrame(c)
        c_df_sort = c_df.sort_values(by=0, ascending=False)  # sort by eigenvalue, descending
        return c_df_sort.values

    def get_explained_variance(self):
        # Sorted eigenvalues = variance explained by each component.
        return self.get_features()[:, 0]

    def reduce_dimension(self):
        c_df_sort = self.get_features()
        variance = self.get_explained_variance()
        # print(variance)
        if self.n_components:
            p = c_df_sort[0:self.n_components, 1:]  # eigenvectors, one per row
            p = np.transpose(p)
            y = np.dot(self.x, p)
            return y

        # Otherwise keep the smallest number of components whose cumulative
        # variance ratio reaches 0.99.
        variance_sum = sum(variance)
        variance_ratio = variance / variance_sum

        variance_contribution = 0
        R = 0
        for r in range(self.dimension):
            variance_contribution += variance_ratio[r]
            if variance_contribution >= 0.99:
                R = r
                break
        self.n_byratio = R + 1
        print('R', R + 1)
        # print('c_df_sort', c_df_sort)
        p = c_df_sort[0:R + 1, 1:]  # eigenvectors, one per row
        # print('p', p)
        p = np.transpose(p)
        return np.dot(self.x, p)


# x = np.array([[1, 3, 4], [3, 4, 5], [3, 5, 7], [1, 4, 6]])
digits = load_digits()
x = digits.data
y = digits.target
pca = PCA(x)
f = pca.get_features()
# print(f)

x_new = pca.reduce_dimension()
print('x_new', x_new)
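A quick sanity check (not in the original post): compare the custom class against sklearn's PCA. Two caveats for the comparison: the custom class projects the data without subtracting the feature means (only np.cov centers internally), so we pre-center the input; and eigenvectors are only defined up to sign, so we compare absolute values. A minimal sketch, assuming the class above is in scope:

from sklearn.decomposition import PCA as SkPCA

x_centered = x - x.mean(axis=0)  # the custom class does not center before projecting
x_custom = PCA(x_centered, n_components=10).reduce_dimension()

x_sklearn = SkPCA(n_components=10).fit_transform(x)  # sklearn centers internally

# Compare up to sign: the max difference should be close to 0.
diff = np.abs(np.abs(x_custom) - np.abs(x_sklearn))
print('max abs difference:', diff.max())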

A small variation:

from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

digits = load_digits()
x1 = digits.data
y1 = digits.target


class ComponentsError(ValueError):
    pass


class PCA(object):
    def __init__(self, x, n_components=None):
        self.x = x
        m = self.x.shape[1]
        self.n_components = n_components
        if self.n_components and self.n_components > m:
            raise ComponentsError('n_components is larger than the feature dimension')

    def cov(self):
        x_T = np.transpose(self.x)
        x_cov = np.cov(x_T)  # np.cov treats rows as variables, so transpose: covariance of the features
        return x_cov

    def get_features(self):
        x_cov = self.cov()
        m = x_cov.shape[0]
        v, p = np.linalg.eig(x_cov)  # eigenvectors are the *columns* of p
        f = np.hstack((v.reshape(m, 1), np.transpose(p)))  # each row: [eigenvalue, eigenvector]
        f_df = pd.DataFrame(f)
        f_df_sorted = f_df.sort_values(by=0, ascending=False)  # sort by eigenvalue, descending
        return f_df_sorted.values

    def reduce_dim(self):
        f = self.get_features()

        v = f[:, 0]
        p = f[:, 1:]
        # Reduce to the requested number of dimensions
        if self.n_components:
            y = np.matmul(self.x, np.transpose(p[0:self.n_components]))
            return y
        v_all = sum(v)
        v_ratio = v / v_all
        print('v_ratio=', v_ratio)
        R = 0
        v_sum = 0.0
        for r in range(len(v)):
            v_sum += v_ratio[r]
            if v_sum >= 0.99:
                R = r
                break
        print('R=', R+1)
        v_reduce_dim = v[0: R + 1]  # eigenvalues of the retained components (for inspection)
        p_reduce_dim = p[0: R + 1]
        y = np.matmul(self.x, np.transpose(p_reduce_dim))
        return y


pca = PCA(x1)
feats = pca.get_features()
print(feats)

x_new = pca.reduce_dim()
print('x_new', x_new)


# Plot the variance of each component, i.e. the eigenvalue of each eigenvector
plt.figure()
plt.plot(feats[:, 0], 'k', linewidth=2)  # column 0 of feats holds the sorted eigenvalues
plt.xlabel('n_components', fontsize=16)
plt.ylabel('explained_variance', fontsize=16)
plt.show()
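To close the loop with section 2, the output of the custom reduce_dim can be fed into the same logistic-regression check. A minimal sketch reusing x1/y1 and the class above (the choice of 30 components and the max_iter value are assumptions for illustration):

from sklearn.linear_model import LogisticRegression

x_reduced = PCA(x1, n_components=30).reduce_dim()  # 30 components is an arbitrary choice

lr = LogisticRegression(penalty='l2', C=0.01, max_iter=1000)  # max_iter raised to avoid the solver's iteration cap
lr.fit(x_reduced, y1)
print('lr.score on custom-PCA features =', lr.score(x_reduced, y1))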
