import numpy as np
class PCA:
    """Principal component analysis of a delimited numeric text file.

    Attributes:
        readData: 2-D float array parsed from the file, with every NaN
            entry replaced by the mean of the non-NaN values in its column.
        initData: ``readData`` with each column's mean subtracted.
        Cov: covariance matrix of the columns of ``initData`` (always 2-D).
        EValue: eigenvalues of ``Cov``, real, sorted in descending order.
        EVector: matching unit eigenvectors, stored as columns in the same
            order as ``EValue``.
    """

    def __init__(self, fileName, splitBy=' '):
        """Load the data file and compute its eigen-decomposition.

        Args:
            fileName: path of a text file holding one sample per line.
            splitBy: separator passed to ``str.split`` for each line.

        Raises:
            ValueError: if the file does not yield a rectangular table with
                at least two rows and one column, or a field is not numeric.
        """
        # The context manager guarantees the handle is closed; blank lines
        # (e.g. a trailing empty line) are skipped instead of breaking parsing.
        with open(fileName) as handle:
            rows = [
                line.rstrip('\r\n').split(splitBy)
                for line in handle
                if line.strip()
            ]
        self.readData = np.array(rows, dtype='float')
        if self.readData.ndim != 2 or self.readData.shape[0] < 2 \
                or self.readData.shape[1] < 1:
            raise ValueError(
                'expected a table with at least 2 rows and 1 column, '
                'got shape %s' % (self.readData.shape,)
            )

        # Replace every NaN by the mean of the non-NaN values in its column.
        nan_rows, nan_cols = np.where(np.isnan(self.readData))
        column_means = np.nanmean(self.readData, axis=0)
        self.readData[nan_rows, nan_cols] = column_means[nan_cols]

        # Centre the data, then take the covariance between columns.
        self.initData = self.readData - self.readData.mean(axis=0)
        # np.cov returns a 0-d array for a single column; keep it 2-D so the
        # eigen-solver accepts it.
        self.Cov = np.atleast_2d(np.cov(self.initData.T))

        # A covariance matrix is symmetric, so eigh applies: it returns real
        # eigenvalues (eig may return complex ones and in arbitrary order).
        # eigh sorts ascending; reverse so the largest variance comes first
        # and EVector[:, :dim] really holds the top-dim components.
        eigen_values, eigen_vectors = np.linalg.eigh(self.Cov)
        order = np.argsort(eigen_values)[::-1]
        self.EValue = eigen_values[order]
        self.EVector = eigen_vectors[:, order]

    def RecudeDimension(self, dim):
        """Project the centred data onto the first ``dim`` components.

        Args:
            dim: number of leading components to keep, between 0 and the
                number of columns in the data (inclusive).

        Returns:
            A 3-tuple ``(projected, ratios, total_ratio)`` where
            ``projected`` is ``initData`` multiplied by the first ``dim``
            eigenvectors, ``ratios`` is a list with each kept eigenvalue
            divided by the sum of all eigenvalues, and ``total_ratio`` is
            the sum of the kept eigenvalues divided by the sum of all
            eigenvalues.

        Raises:
            ValueError: if ``dim`` is negative or exceeds the number of
                available components.
        """
        n_components = self.EValue.shape[0]
        if not 0 <= dim <= n_components:
            raise ValueError(
                'dim must be between 0 and %d, got %r' % (n_components, dim)
            )
        total_variance = self.EValue.sum()
        kept_values = self.EValue[:dim]
        projected = np.dot(self.initData, self.EVector[:, :dim])
        ratios = (kept_values / total_variance).tolist()
        total_ratio = float(kept_values.sum() / total_variance)
        return projected, ratios, total_ratio
if __name__ == '__main__':
    # Demo run: load the SECOM data file, keep the first 20 components and
    # print the projected data followed by the two variance-ratio results.
    reducer = PCA('./PCA/secom.data')
    projected, ratios, ratio_total = reducer.RecudeDimension(20)
    report = (
        ('降维后的数据:', projected),
        ('前20个特征的方差贡献率:', ratios),
        ('前20个特征的总方差贡献率:', ratio_total),
    )
    for label, value in report:
        print(label, value)
# Dataset link: https://pan.baidu.com/s/1BB3TRVvK-BALs_Uwe0ejIw
# Extraction code: tdlp