机器学习实战之PCA

from numpy import *

def loadDataSet(fileName, delim = '\t'):
    """Load a delimited text file of numbers into a NumPy matrix.

    Every line is stripped, split on ``delim`` and each field is converted
    with ``float``; one line of the file becomes one row of the result.

    Args:
        fileName: Path of the text file to read.
        delim: Field separator used on each line (tab by default).

    Returns:
        A ``numpy.matrix`` holding one row per line of the file.

    Raises:
        ValueError: If a field cannot be parsed as a float.
    """
    # The context manager closes the handle even when parsing raises.
    with open(fileName) as fr:
        # Build real lists of floats: on Python 3 ``map`` is lazy, and a list
        # of map objects cannot be turned into a numeric matrix.
        datArr = [[float(field) for field in line.strip().split(delim)]
                  for line in fr]
    # ``asmatrix`` exists in both NumPy 1.x and 2.x (``mat`` was removed in 2.0).
    return asmatrix(datArr)

def pca(dataMat, topNfeat = 9999999):
    """Reduce data to its leading principal components.

    Args:
        dataMat: 2-D data with one sample per row and one feature per
            column (the covariance is computed with ``rowvar = 0``).
        topNfeat: Number of principal components to keep. The default is
            large enough that every component is kept.

    Returns:
        A tuple ``(lowDDataMat, reconMat)``:
        ``lowDDataMat`` is the mean-centred data projected onto the kept
        eigenvectors (one column per component, largest eigenvalue first);
        ``reconMat`` is that projection mapped back into the original
        feature space with the column means added again.
    """
    meanVals = mean(dataMat, axis = 0)
    meanRemoved = dataMat - meanVals
    # ``asmatrix`` is used instead of ``mat`` because ``mat`` was removed in
    # NumPy 2.0; it also lifts the 0-d covariance of a single feature to 1x1.
    covMat = asmatrix(cov(meanRemoved, rowvar = 0))
    # A covariance matrix is symmetric, so use the symmetric solver: it
    # returns real eigenvalues/eigenvectors, whereas ``eig`` may return
    # complex values caused purely by round-off.
    # eigVects[:, i] is the eigenvector belonging to eigVals[i].
    eigVals, eigVects = linalg.eigh(covMat)
    # Indices of the eigenvalues from largest to smallest, truncated to the
    # requested number of components.
    eigValInd = argsort(eigVals)[::-1][:topNfeat]
    redEigVects = eigVects[:, eigValInd]
    # Coordinates of each centred sample along the kept eigenvectors.
    lowDDataMat = meanRemoved * redEigVects
    # Map the reduced coordinates back to feature space and undo centring.
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat

# dataMat = loadDataSet('testSet.txt', '\t')
# lowDMat, reconMat = pca(dataMat,1)
# print lowDMat,reconMat
# import matplotlib
# import matplotlib.pyplot as plt
# fig = plt.figure()
# ax = fig.add_subplot(111)
# ax.scatter(dataMat[:,0].flatten().A[0],dataMat[:,1].flatten().A[0],marker='^',s=90)
# ax.scatter(reconMat[:,0].flatten().A[0],reconMat[:,1].flatten().A[0],marker='o',s=90,c='r')
# plt.show()

def replaceNanWithMean(fileName = 'secom.data', delim = ' '):
    """Load a data file and replace NaN entries with their column mean.

    Args:
        fileName: Path of the data file passed to ``loadDataSet``
            (defaults to ``'secom.data'``).
        delim: Field separator passed to ``loadDataSet`` (a single space
            by default).

    Returns:
        The loaded matrix in which every NaN has been replaced by the mean
        of the non-NaN values of the same column. A column that consists
        only of NaN values is left unchanged.
    """
    datMat = loadDataSet(fileName, delim)
    numFeat = shape(datMat)[1]
    for i in range(numFeat):
        # 1-D boolean mask of the rows whose value in column i is NaN.
        missing = isnan(datMat[:, i].A[:, 0])
        # Skip columns with nothing to fill, and columns with nothing to
        # average (the mean of an empty selection would itself be NaN).
        if not missing.any() or missing.all():
            continue
        datMat[missing, i] = mean(datMat[~missing, i])
    return datMat

# dataMat = replaceNanWithMean()
# print dataMat

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值