PCA python实现
转载自:https://blog.csdn.net/ws_developer/article/details/81634059
部分代码有修改。
1.理解PCA的几个背景概念:
坐标系:
基向量:
矩阵乘法:
变换(基变换):
特征:
方差:
协方差:
特征值分解:
2.使用sklearn.decomposition中PCA
将降维前后的特征分别输入逻辑回归(LR)模型,对比得分以验证降维效果。
from sklearn import datasets
from sklearn.decomposition import PCA
import matplotlib.pylab as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
# Compare logistic-regression accuracy on the original digits features with
# the accuracy obtained after reducing the data to k principal components,
# for k = 1..63. Scores are computed on the same data the model was fitted
# on, so they are training accuracies.
digits = datasets.load_digits()
x = digits.data
y = digits.target

scores_before_pca = []
scores_after_pca = []

# Baseline: fit and score on the unreduced features.
lr = LogisticRegression(penalty='l2', C=0.01)
lr.fit(x, y)
before_score = lr.score(x, y)
scores_before_pca.append(before_score)
print('lr.score before pca = ', before_score)

component_counts = list(range(1, 64))
for k in component_counts:
    pca = PCA(n_components=k)
    # fit_transform() fits the model itself, so no separate pca.fit(x) call
    # is needed beforehand.
    x_new = pca.fit_transform(x)
    # Measure the LR model on the reduced features.
    lr.fit(x_new, y)
    after_score = lr.score(x_new, y)
    scores_after_pca.append(after_score)

plt.figure()
# The baseline is a single number. Plotting a one-element list as a line
# draws nothing, so show it as a horizontal reference line instead.
plt.axhline(before_score, color='k', linewidth=10)
# Plot against the real component counts so the x axis matches its label.
plt.plot(component_counts, scores_after_pca, 's', linewidth=2)
plt.xlabel('n_components', fontsize=16)
plt.ylabel('score', fontsize=16)
plt.show()
# Disabled snippet: the triple-quoted string below is a bare expression
# statement, so the plotting code inside it is never executed. If enabled it
# would plot pca.explained_variance_ (per its own comment: the variance of
# each feature after reduction, i.e. the eigenvalue of each eigenvector).
'''
# 绘制降维后各特征方差值,即特征向量对应的特征值
plt.figure()
plt.plot(pca.explained_variance_, 'k', linewidth=2)
plt.xlabel('n_components', fontsize=16)
plt.ylabel('explained_variance', fontsize=16)
plt.show()
'''
3.自定义PCA实现
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
class DimensionValueError(ValueError):
    """Raised when n_components is not smaller than the number of features."""


class PCA(object):
    """Principal component analysis via eigen-decomposition of the covariance.

    Parameters
    ----------
    x : numpy.ndarray
        2-D data matrix; rows are samples and columns are features.
    n_components : int or None
        Number of components to keep. When falsy (``None`` or ``0``), the
        count is chosen so that the kept components reach a cumulative
        explained-variance ratio of 0.99.

    Raises
    ------
    DimensionValueError
        If ``n_components`` is truthy and ``>=`` the number of features.
    """

    def __init__(self, x, n_components=None):
        self.x = x
        self.dimension = x.shape[1]
        # Number of components chosen by the variance-ratio rule; stays 0
        # until reduce_dimension() runs without an explicit n_components.
        self.n_byratio = 0
        if n_components and n_components >= self.dimension:
            raise DimensionValueError("n_components error")
        self.n_components = n_components

    def cov(self):
        """Return the covariance of the features (columns of ``x``)."""
        # np.cov treats rows as variables, hence the transpose.
        return np.cov(np.transpose(self.x))

    def _sorted_eig(self):
        """Return (eigenvalues, eigenvectors) sorted by descending eigenvalue.

        Eigenvectors are the *columns* of the returned matrix.
        """
        # A covariance matrix is symmetric, so eigh applies; unlike eig it
        # always yields real results. atleast_2d covers single-feature input,
        # where np.cov returns a 0-d array.
        values, vectors = np.linalg.eigh(np.atleast_2d(self.cov()))
        order = np.argsort(values)[::-1]
        return values[order], vectors[:, order]

    def get_features(self):
        """Return an array whose row i is ``[eigenvalue_i, eigenvector_i...]``.

        Rows are ordered by descending eigenvalue.
        """
        values, vectors = self._sorted_eig()
        # The decomposition stores eigenvectors as columns; transpose so each
        # eigenvalue sits in the same row as its own eigenvector.
        return np.hstack((values.reshape((-1, 1)), np.transpose(vectors)))

    def get_explained_varience(self):
        """Return the eigenvalues in the same descending order as get_features()."""
        values, _ = self._sorted_eig()
        return values

    def reduce_dimension(self):
        """Project ``x`` onto the leading eigenvectors and return the result.

        With a truthy ``n_components`` exactly that many components are used.
        Otherwise the smallest leading set whose cumulative variance ratio is
        at least 0.99 is used; that count is stored in ``n_byratio`` and
        printed. The data is projected as-is (it is not mean-centred first).
        """
        features = self.get_features()
        varience = features[:, 0]
        vectors = features[:, 1:]  # one eigenvector per row

        if self.n_components:
            p = np.transpose(vectors[0:self.n_components])
            return np.dot(self.x, p)

        varience_ratio = varience / varience.sum()
        # Index of the first component at which the running ratio reaches
        # 0.99; argmax yields 0 when the threshold is never reached.
        R = int(np.argmax(np.cumsum(varience_ratio) >= 0.99))
        self.n_byratio = R + 1
        print('R', R + 1)
        p = np.transpose(vectors[0:R + 1])
        return np.dot(self.x, p)
# Demo on the scikit-learn digits data. No n_components is passed, so
# reduce_dimension() chooses the component count from its 0.99 cumulative
# variance-ratio threshold.
# Small alternative input for manual checks:
#   x = np.array([[1, 3, 4], [3, 4, 5], [3, 5, 7], [1, 4, 6]])
digits = load_digits()
x, y = digits.data, digits.target
pca = PCA(x)
f = pca.get_features()
x_new = pca.reduce_dimension()
print('x_new', x_new)
小改动(对上面自定义PCA实现的修改版):
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
import matplotlib.pylab as plt
import pandas as pd
import numpy as np
# Load the digits data set: x1 receives its `data` attribute and y1 its
# `target` attribute.
digits = load_digits()
x1, y1 = digits.data, digits.target
class ComponentsError(ValueError):
    """Raised when more components are requested than the data has features."""


class PCA(object):
    """Principal component analysis via eigen-decomposition of the covariance.

    Parameters
    ----------
    x : numpy.ndarray
        2-D data matrix; rows are samples and columns are features.
    n_components : int or None
        Number of components to keep. When falsy (``None`` or ``0``), the
        count is chosen so that the kept components reach a cumulative
        explained-variance ratio of 0.99.

    Raises
    ------
    ComponentsError
        If ``n_components`` is truthy and greater than the number of features.
    """

    def __init__(self, x, n_components=None):
        self.x = x
        m = self.x.shape[1]
        self.n_components = n_components
        if self.n_components and self.n_components > m:
            # Must be raised: returning a value from __init__ only produces
            # an unrelated TypeError.
            raise ComponentsError('n_components is more bigger!')

    def cov(self):
        """Return the covariance of the features (columns of ``x``)."""
        # np.cov treats rows as variables, hence the transpose.
        return np.cov(np.transpose(self.x))

    def get_features(self):
        """Return an array whose row i is ``[eigenvalue_i, eigenvector_i...]``.

        Rows are ordered by descending eigenvalue.
        """
        # A covariance matrix is symmetric, so eigh applies; unlike eig it
        # always yields real results. atleast_2d covers single-feature input,
        # where np.cov returns a 0-d array.
        x_cov = np.atleast_2d(self.cov())
        values, vectors = np.linalg.eigh(x_cov)
        order = np.argsort(values)[::-1]
        # The decomposition stores eigenvectors as columns; transpose so each
        # eigenvalue sits in the same row as its own eigenvector.
        return np.hstack((values[order].reshape(-1, 1),
                          np.transpose(vectors[:, order])))

    def reduce_dim(self):
        """Project ``x`` onto the leading eigenvectors and return the result.

        With a truthy ``n_components`` exactly that many components are used.
        Otherwise the smallest leading set whose cumulative variance ratio is
        at least 0.99 is used; the ratios and the chosen count are printed.
        The data is projected as-is (it is not mean-centred first).
        """
        f = self.get_features()
        v = f[:, 0]   # eigenvalues, descending
        p = f[:, 1:]  # one eigenvector per row

        # Reduce to the explicitly requested number of dimensions.
        if self.n_components:
            return np.matmul(self.x, np.transpose(p[0:self.n_components]))

        v_ratio = v / v.sum()
        print('v_ratio=', v_ratio)
        # Index of the first component at which the running ratio reaches
        # 0.99; argmax yields 0 when the threshold is never reached.
        R = int(np.argmax(np.cumsum(v_ratio) >= 0.99))
        print('R=', R + 1)
        return np.matmul(self.x, np.transpose(p[0:R + 1]))
# Demo: run the custom PCA on x1 without an explicit n_components and print
# the eigenvalue/eigenvector table and the reduced data.
pca = PCA(x1)
feats = pca.get_features()
print(feats)
x_new = pca.reduce_dim()
print('x_new', x_new)

# Plot the variance of each component after reduction, i.e. the eigenvalue
# belonging to each eigenvector. Only column 0 of feats holds eigenvalues;
# the remaining columns are eigenvector entries and do not belong on an
# explained-variance curve.
explained_variance = feats[:, 0]
plt.figure()
plt.plot(np.arange(1, len(explained_variance) + 1), explained_variance,
         'k', linewidth=2)
plt.xlabel('n_components', fontsize=16)
plt.ylabel('explained_variance', fontsize=16)
plt.show()