Implementing PCA in Python

The idea behind principal component analysis (PCA) is fairly simple, and it makes a useful comparison with linear discriminant analysis (LDA). I spent yesterday afternoon through this morning writing it up in Python; the code mainly follows the book Machine Learning in Action (机器学习实战) and is listed below:

#encoding:utf-8
'''
Created on 2015-09-23
@author: ZHOUMEIXU204
'''

path=u'D:\\Users\\zhoumeixu204\\Desktop\\python语言机器学习\\机器学习实战代码   python\\机器学习实战代码\\machinelearninginaction\\Ch13\\'
import numpy as np
import matplotlib.pyplot as plt

def loadDataSet(filename, delim='\t'):
    # read a delimited text file into a numpy matrix, one row per sample
    with open(filename) as fr:
        stringArr = [line.strip().split(delim) for line in fr.readlines()]
    datArr = [list(map(float, line)) for line in stringArr]  # list(...) so this also runs on Python 3
    return np.mat(datArr)


def pca(dataMat, topNfeat=9999999):
    meanVals = np.mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals          # remove the mean
    covMat = np.cov(meanRemoved, rowvar=0)    # PCA seeks the direction a that maximizes Var(a'X) = a'Cov(X)a
    eigVals, eigVects = np.linalg.eig(np.mat(covMat))
    eigValInd = np.argsort(eigVals)           # sort, smallest to largest
    eigValInd = eigValInd[:-(topNfeat+1):-1]  # cut off unwanted dimensions
    redEigVects = eigVects[:, eigValInd]      # reorganize eigenvectors, largest to smallest eigenvalue
    lowDDataMat = meanRemoved * redEigVects   # transform data into the new, lower-dimensional space
    reconMat = (lowDDataMat * redEigVects.T) + meanVals   # reconstruct the data in the original space
    return lowDDataMat, reconMat

dataMat = loadDataSet(path+'testSet.txt')
print(dataMat)
lowDMat, recoMat = pca(dataMat, 1)
print('Low-dimensional projection:')
print(lowDMat)
print('Reconstructed data:')
print(recoMat)
    
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(dataMat[:,0].flatten().A[0], dataMat[:,1].flatten().A[0], marker='^', s=90)           # original data
ax.scatter(recoMat[:,0].flatten().A[0], recoMat[:,1].flatten().A[0], marker='o', s=50, c='red')  # reconstruction from 1 component
plt.show()
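
As a quick sanity check (a sketch, assuming scikit-learn is installed; nothing below is from the book's code), the same one-dimensional projection can be reproduced with sklearn.decomposition.PCA and compared against lowDMat, which should match up to the sign of the eigenvector:

# Sketch: cross-check the hand-rolled PCA against scikit-learn (assumes sklearn is available)
from sklearn.decomposition import PCA

skProj = PCA(n_components=1).fit_transform(np.asarray(dataMat))   # projection onto the first component
# eigenvector signs are arbitrary, so compare absolute values
print(np.allclose(np.abs(skProj), np.abs(np.asarray(lowDMat))))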
    
    
    
def replaceNanWithMean():
    # the secom data contains NaN entries; replace each NaN with the mean of its column
    datMat = loadDataSet(path+'secom.data', ' ')
    numFeat = np.shape(datMat)[1]
    for i in range(numFeat):
        meanVal = np.mean(datMat[np.nonzero(~np.isnan(datMat[:,i].A))[0], i])   # mean of the non-NaN values in column i
        datMat[np.nonzero(np.isnan(datMat[:,i].A))[0], i] = meanVal             # overwrite the NaN entries with that mean
    return datMat
dataMat=replaceNanWithMean()
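
A quick check (a sketch, not part of the original script) that the mean-imputation left no NaN entries behind:

assert not np.isnan(dataMat).any()   # every NaN in secom.data should now be a column mean
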
meanVals =np.mean(dataMat, axis=0)
meanRemoved = dataMat - meanVals #remove mean
covMat = np.cov(meanRemoved, rowvar=0)
eigVals,eigVects = np.linalg.eig(np.mat(covMat))
eigValInd = np.argsort(eigVals)            #sort, sort goes smallest to largest
eigValInd = eigValInd[::-1]#reverse
sortedEigVals = eigVals[eigValInd]
total = sum(sortedEigVals)
varPercentage = sortedEigVals/total*100
# plot the percentage of variance explained by the leading principal components
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(range(1, 21), varPercentage[:20], marker='^')
plt.xlabel('Principal Component Number')
plt.ylabel('Percentage of Variance')
plt.show()
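
Beyond eyeballing the plot, the number of components to keep can be read off the cumulative variance. The 90% threshold below is only an illustrative choice, not something taken from the book:

# Sketch: smallest number of components that explains at least 90% of the variance
cumVar = np.cumsum(varPercentage)
k = int(np.argmax(cumVar >= 90.0)) + 1    # index of the first True, converted to a 1-based count
print('Components needed for >= 90%% of the variance: %d' % k)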

