PCA dimensionality reduction: a Python implementation
Simple implementation in Python
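The hand-rolled implementation below follows the standard PCA pipeline: normalize each feature column, build the covariance matrix of the normalized data, take its SVD to obtain the principal directions U, project the data onto the first K columns of U, then map the reduced data back to approximate the original. Finally it reports the contribution (retained-variance) ratio of the K components.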
import numpy as np

# Feature normalization: (each value - its column's mean) / that column's standard deviation
def featureNormalize(X):
    n = X.shape[1]  # number of features
    Y = np.zeros((X.shape[0], n))
    mu = np.mean(X, axis=0)     # column means
    sigma = np.std(X, axis=0)   # column standard deviations
    for i in range(n):
        Y[:, i] = (X[:, i] - mu[i]) / sigma[i]
    return Y, mu, sigma
# Project the normalized data onto the top K principal components
def reduceDimensionData(X_norm, U, K):
    U_reduce = U[:, 0:K]  # keep the first K columns of U
    Z = np.dot(X_norm, U_reduce)
    return Z
# Map the reduced data back to the original feature space (an approximation)
def recoverData(Z, U, K):
    U_reduce = U[:, 0:K]
    X_rec = np.dot(Z, np.transpose(U_reduce))
    return X_rec
def PCA(X, K):
    print("Original data:")
    print(X)
    x_copy = X.copy()
    x_norm, mu, sigma = featureNormalize(x_copy)  # normalize features
    print("Normalized data:")
    print(x_norm)
    m = X.shape[0]  # number of samples
    cov = np.dot(np.transpose(x_norm), x_norm) / m  # covariance matrix
    U, S, V = np.linalg.svd(cov)  # SVD of the covariance matrix
    print("Singular values S:")
    print(S)
    Z = reduceDimensionData(x_norm, U, K)  # reduced data
    print("Reduced data:")
    print(Z)
    X_rec = recoverData(Z, U, K)  # approximate reconstruction
    for i in range(X_rec.shape[1]):
        X_rec[:, i] = X_rec[:, i] * sigma[i] + mu[i]  # undo the normalization
    print("Data recovered after reduction:")
    print(X_rec)
    # contribution ratio: share of total variance captured by the first K components
    contriRatio = np.sum(S[0:K]) / np.sum(S)
    print("Contribution ratio:")
    print(contriRatio)
def test():
    # sample data: 6 samples with 4 features
    X = np.array([[-1, 2, 66, -1], [-2, 6, 58, -1], [-3, 8, 45, -2],
                  [1, 9, 36, 1], [2, 10, 62, 1], [3, 5, 83, 2]])
    PCA(X, 2)

if __name__ == '__main__':
    np.set_printoptions(suppress=True)
    test()
Result:
Original data:
[[-1 2 66 -1]
[-2 6 58 -1]
[-3 8 45 -2]
[ 1 9 36 1]
[ 2 10 62 1]
[ 3 5 83 2]]
Normalized data:
[[-0.46291005 -1.73648628 0.50972854 -0.70710678]
[-0.9258201 -0.24806947 -0.02216211 -0.70710678]
[-1.38873015 0.49613894 -0.88648441 -1.41421356]
[ 0.46291005 0.86824314 -1.48486139 0.70710678]
[ 0.9258201 1.24034735 0.24378321 0.70710678]
[ 1.38873015 -0.62017367 1.63999617 1.41421356]]
Singular values S:
[2.27496357 1.49208403 0.21930923 0.01364317]
Reduced data:
[[ 0.54316071 1.86902683]
[ 1.05637399 0.47581872]
[ 2.16552407 -0.37225405]
[-0.11661105 -1.73853588]
[-1.16084093 -1.12805905]
[-2.48760679 0.89400342]]
Data recovered after reduction:
[[-1.38796114 2.71908396 70.6695927 -1.04408654]
[-1.63490905 5.63833517 55.54993895 -1.08525997]
[-2.89740254 7.39507453 41.20533621 -1.82146769]
[ 0.74857579 10.32875385 44.35076001 0.6243176 ]
[ 2.00050576 9.07043478 56.26271813 1.37367778]
[ 3.17119118 4.84831772 81.961654 1.95281882]]
Contribution ratio:
0.9417618999761952
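A quick sanity check on why the entries of S can be summed as variances here: the covariance matrix is symmetric positive semidefinite, so its singular values coincide with its eigenvalues. A minimal sketch (assuming the same X as in test() above) verifying this with np.linalg.eigh:

import numpy as np

X = np.array([[-1, 2, 66, -1], [-2, 6, 58, -1], [-3, 8, 45, -2],
              [1, 9, 36, 1], [2, 10, 62, 1], [3, 5, 83, 2]])
x_norm = (X - X.mean(axis=0)) / X.std(axis=0)   # same normalization as featureNormalize
cov = x_norm.T @ x_norm / X.shape[0]

_, S, _ = np.linalg.svd(cov)    # singular values, descending
w, _ = np.linalg.eigh(cov)      # eigenvalues, ascending
print(np.allclose(S, w[::-1]))  # True: for a symmetric PSD matrix they coincide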
Implementation using PCA from sklearn.decomposition
# coding=utf-8
import numpy as np
from sklearn.decomposition import PCA

X = np.array([[-1, 2, 66, -1], [-2, 6, 58, -1], [-3, 8, 45, -2],
              [1, 9, 36, 1], [2, 10, 62, 1], [3, 5, 83, 2]])  # data with 4 features
pca = PCA(n_components=2)    # reduce to 2 dimensions
newX = pca.fit_transform(X)  # fits the model and returns the reduced data in one step
print("Explained variance ratio of each component:")
print(pca.explained_variance_ratio_)
print("Original data:")
print(X)
print("Reduced data:")
print(newX)
print("Data recovered after reduction:")
oldX = pca.inverse_transform(newX)
print(oldX)
Result:
Explained variance ratio of each component:
[0.95713353 0.03398198]
Original data:
[[-1 2 66 -1]
[-2 6 58 -1]
[-3 8 45 -2]
[ 1 9 36 1]
[ 2 10 62 1]
[ 3 5 83 2]]
Reduced data:
[[ 7.96504337 4.12166867]
[ -0.43650137 2.07052079]
[-13.63653266 1.86686164]
[-22.28361821 -2.32219188]
[ 3.47849303 -3.95193502]
[ 24.91311585 -1.78492421]]
Data recovered after reduction:
[[-1.94037472 3.03463549 66.1814671 -1.37118938]
[-1.27479012 5.28296796 57.87329006 -0.852196 ]
[-2.04477761 6.73464855 44.78058024 -1.25592271]
[-0.10998941 10.48087847 36.25669113 0.11736839]
[ 2.61200674 9.04489278 61.8357845 1.72394933]
[ 2.75792512 5.42197676 83.07218697 1.63799036]]
For the underlying theory, see the blogs below; they are all written in great detail.
References:
Fairly complete examples, including GitHub code
Fairly detailed explanation
This one is more concise