# Learned from the book "Machine Learning in Action" (机器学习实战)
from numpy import *
def loadDataSet(fileName, delim='\t'):
    """Read a delimited text file of numbers into a list of float rows.

    Args:
        fileName: path of the text file to read.
        delim: separator between the fields of a line (tab by default).

    Returns:
        A list with one entry per non-blank line; each entry is the list
        of that line's fields converted to float.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a field cannot be converted to float.
    """
    dataMat = []
    # The context manager closes the file even if a field fails to parse.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no data.
                continue
            dataMat.append([float(field) for field in stripped.split(delim)])
    return dataMat
def pca(dataMat, topNfeat=999999):
    """Project data onto its top principal components and reconstruct it.

    Args:
        dataMat: 2-D array-like (list of rows, ndarray or matrix) with one
            sample per row and one feature per column.
        topNfeat: maximum number of principal components to keep. Values
            larger than the number of features keep every component.

    Returns:
        A tuple ``(lowDateMat, reconMat)`` of numpy matrices: the
        mean-centred data expressed in the retained components, and the
        data rebuilt from that projection in the original feature space.

    Raises:
        ValueError: if ``dataMat`` is empty.
        linalg.LinAlgError: if the covariance matrix contains NaN or inf
            (for example when the input itself contains such values).
    """
    # Normalise the input so lists, arrays and matrices all follow the
    # same matrix-algebra path below (``*`` is a matrix product).
    dataMat = mat(dataMat)
    if dataMat.size == 0:
        raise ValueError("dataMat must contain at least one sample")

    meanVals = mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals  # centre every feature on zero
    covMat = mat(cov(meanRemoved, rowvar=0))  # columns are the variables
    if not isfinite(covMat).all():
        # Fail loudly instead of propagating NaNs into the result.
        raise linalg.LinAlgError("covariance matrix contains NaN or inf")

    # A covariance matrix is symmetric, so the symmetric solver applies.
    # It returns real eigenvalues/eigenvectors, whereas the general solver
    # can return complex values caused by round-off.
    eigVals, eigVects = linalg.eigh(covMat)

    # Indices of the eigenvalues, largest first, truncated to topNfeat.
    eigValInd = argsort(eigVals)[:-(topNfeat + 1):-1]
    redEigVects = eigVects[:, eigValInd]

    lowDateMat = meanRemoved * redEigVects
    # Map the projection back to the original space and undo the centring.
    reconMat = (lowDateMat * redEigVects.T) + meanVals
    return lowDateMat, reconMat
if __name__ == '__main__':
    # Script entry point: load the sample data file, run PCA on it and
    # print the returned (projection, reconstruction) pair.
    print(pca(loadDataSet('cdata.txt')))