主成分分析(PCA)的思想比较简单,可以和LDA线性判别方法做个比较。昨天下午到今天上午,我在Python里实现了一下,代码主要参考《机器学习实战》一书,具体如下:
#encoding:utf-8
'''
Created on 2015年9月23日
@author: ZHOUMEIXU204
'''
# Directory prefix for the data files used below; file names such as
# 'testSet.txt' and 'secom.data' are appended to it with ``+``, so it must
# end with a path separator.
path=u'D:\\Users\\zhoumeixu204\\Desktop\\python语言机器学习\\机器学习实战代码 python\\机器学习实战代码\\machinelearninginaction\\Ch13\\'
import numpy as np
import matplotlib.pyplot as plt
def loadDataSet(filename, delim='\t'):
    """Read a delimited text file of numbers into a NumPy matrix.

    Args:
        filename: Path of the text file to read.
        delim: Field separator used on every line (a tab by default).

    Returns:
        A ``numpy.matrix`` with one row per non-blank line and one column
        per field, each field converted with ``float``.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        # Blank lines (e.g. a trailing newline at the end of the file) are
        # skipped instead of failing on float('').
        stringArr = [line.strip().split(delim) for line in fr if line.strip()]
    # A list comprehension rather than map(): on Python 3 map() is lazy and
    # would leave map objects, not numbers, inside the rows.
    datArr = [[float(field) for field in fields] for fields in stringArr]
    # np.asmatrix is the function np.mat used to alias; the alias itself was
    # removed in NumPy 2.0.
    return np.asmatrix(datArr)
def pca(dataMat, topNfeat=9999999):
    """Project a data set onto its leading principal components.

    Args:
        dataMat: 2-D data set with one sample per row and one feature per
            column (a ``numpy.matrix`` or anything ``np.asmatrix`` accepts).
        topNfeat: Number of principal components to keep; the large default
            keeps all of them.

    Returns:
        A tuple ``(lowDDataMat, reconMat)`` of ``numpy.matrix`` objects:
        the mean-centred data expressed in the kept components, and the
        data reconstructed from those components in the original feature
        space.
    """
    # Matrix semantics are needed below, where ``*`` means matrix product.
    dataMat = np.asmatrix(dataMat)
    meanVals = np.mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals  # remove mean
    # Covariance of the features (columns). The direction a that maximises
    # Var(a'X) = a' Cov(X) a is the direction of largest variance.
    # atleast_2d: np.cov returns a 0-d array when there is a single feature.
    covMat = np.atleast_2d(np.cov(meanRemoved, rowvar=False))
    # A covariance matrix is symmetric, so eigh applies; unlike eig it always
    # returns real eigenvalues and eigenvectors.
    eigVals, eigVects = np.linalg.eigh(covMat)
    # argsort goes smallest to largest: reverse it, then cut off the
    # unwanted dimensions.
    eigValInd = np.argsort(eigVals)[::-1][:topNfeat]
    # Eigenvectors (as columns) reorganised from largest to smallest.
    redEigVects = np.asmatrix(eigVects[:, eigValInd])
    lowDDataMat = meanRemoved * redEigVects  # transform data into new dimensions
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat
# Demo 1: reduce the 2-D test set to a single principal component and plot
# the original points against their reconstruction from that component.
dataMat = loadDataSet(path + 'testSet.txt')
print(dataMat)  # function-call form, matching the other print calls below
lowDMat, recoMat = pca(dataMat, 1)
# pca() returns the reduced data and the reconstructed data (not eigenvalues
# or eigenvectors), so the printed labels say exactly that.
print(u'降维后的数据是:')
print(lowDMat)
print(u'重构后的数据是:')
print(recoMat)
fig = plt.figure()
ax = fig.add_subplot(111)
# .A1 flattens each (n, 1) matrix column to the 1-D array scatter expects.
ax.scatter(dataMat[:, 0].A1, dataMat[:, 1].A1, marker='^', s=90)
ax.scatter(recoMat[:, 0].A1, recoMat[:, 1].A1, marker='o', s=50, c='red')
plt.show()
def replaceNanWithMean():
    """Load 'secom.data' and fill every NaN with its column's mean.

    The file is read from ``path`` with a single space as the delimiter.
    For each feature column the mean is taken over the non-NaN entries only
    and written into the rows where that column is NaN.

    Returns:
        The loaded matrix with the NaN entries replaced in place.
    """
    secom = loadDataSet(path + 'secom.data', ' ')
    for col in range(np.shape(secom)[1]):
        # (n, 1) boolean array flagging the missing entries of this column.
        missing = np.isnan(secom[:, col].A)
        presentRows = np.nonzero(~missing)[0]
        missingRows = np.nonzero(missing)[0]
        columnMean = np.mean(secom[presentRows, col])
        secom[missingRows, col] = columnMean
    return secom
# Demo 2: plot the percentage of total variance carried by the leading
# principal components of the NaN-filled data set.
dataMat = replaceNanWithMean()
meanVals = np.mean(dataMat, axis=0)
meanRemoved = dataMat - meanVals  # remove mean
covMat = np.cov(meanRemoved, rowvar=False)
# The covariance matrix is symmetric, so eigh is used: it always returns real
# eigenvalues, whereas eig may return complex ones. np.asmatrix replaces
# np.mat, which was removed in NumPy 2.0.
eigVals, eigVects = np.linalg.eigh(np.asmatrix(covMat))
eigValInd = np.argsort(eigVals)[::-1]  # indices from largest to smallest
sortedEigVals = eigVals[eigValInd]
total = np.sum(sortedEigVals)
varPercentage = sortedEigVals / total * 100
# Plot the variance percentage of at most the first 20 components; the count
# is capped so x and y stay the same length when there are fewer features.
numShown = min(20, len(varPercentage))
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(range(1, numShown + 1), varPercentage[:numShown], marker='^')
plt.xlabel('Principal Component Number')
plt.ylabel('Percentage of Variance')
plt.show()