协方差矩阵的特征向量给出数据散布(方差)的主方向,对应的特征值则是数据在该方向上的方差大小。
第二大特征值对应的特征向量与最大特征值对应的特征向量正交,指向数据散布第二大的方向。
pca.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
#coding=utf-8
from numpy import *
def loadDataSet(fileName, delim='\t'):
    """Load a delimited text file of numbers into a NumPy matrix.

    Every non-blank line is stripped, split on ``delim`` and each field is
    converted with ``float``.

    Args:
        fileName: Path of the text file to read.
        delim: Field separator used on each line (a tab by default).

    Returns:
        A ``numpy.matrix`` with one row per non-blank line of the file.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    # The context manager closes the file even when parsing raises.
    with open(fileName) as fr:
        # Build real lists of floats: a bare ``map`` is a lazy iterator on
        # Python 3 and would produce an object matrix instead of numbers.
        # Blank lines (e.g. a trailing newline) are skipped rather than
        # failing on ``float('')``.
        datArr = [
            [float(field) for field in line.strip().split(delim)]
            for line in fr
            if line.strip()
        ]
    # ``asmatrix`` is the name that still exists in NumPy 2.x; ``mat`` was
    # only an alias for it and has been removed.
    return asmatrix(datArr)
# Principal component analysis (PCA)
def pca(dataMat, topNfeat=9999999):
    """Project data onto its leading principal components.

    Args:
        dataMat: Two-dimensional data with one sample per row and one
            feature per column (anything ``asmatrix`` accepts).
        topNfeat: Maximum number of principal components to keep. The
            default is large enough to keep every component.

    Returns:
        A tuple ``(lowDataMat, reconMat)`` of ``numpy.matrix`` objects:
        ``lowDataMat`` is the mean-centred data expressed in the kept
        components (one column per component, largest variance first) and
        ``reconMat`` is that projection mapped back into the original
        feature space with the mean added back.

    Raises:
        ValueError: If ``topNfeat`` is negative.
    """
    if topNfeat < 0:
        # A negative count would turn the selection below into an
        # unrelated slice instead of "the N largest".
        raise ValueError("topNfeat must be non-negative")

    dataMat = asmatrix(dataMat)
    meanVals = mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals  # centre every feature on zero

    # Covariance between features (columns are the variables).
    covMat = asmatrix(cov(meanRemoved, rowvar=0))

    # The covariance matrix is symmetric, so use the symmetric solver: it
    # always returns real eigenvalues and orthonormal eigenvectors.
    eigVals, eigVects = linalg.eigh(covMat)

    # argsort is ascending; reverse it and keep the topNfeat largest.
    eigValInd = argsort(eigVals)[::-1][:topNfeat]
    redEigVects = asmatrix(eigVects)[:, eigValInd]

    # Express the centred data in the space spanned by the kept eigenvectors.
    lowDataMat = meanRemoved * redEigVects
    # Map the projection back to the original coordinates.
    reconMat = (lowDataMat * redEigVects.T) + meanVals
    return lowDataMat, reconMat
测试:
>>> import pca
>>> from numpy import shape
>>> dataMat = pca.loadDataSet('testSet.txt')
>>> shape(dataMat)
(1000, 2) #原始数据为二维矩阵
>>> lowDMat, reconMat = pca.pca(dataMat, 1)
>>> shape(lowDMat)
(1000, 1) #降为一维
#绘制图像
>>> import matplotlib
>>> import matplotlib.pyplot as plt
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
>>> ax.scatter(dataMat[:,0].flatten().A[0], dataMat[:,1].flatten().A[0], marker='^', s=90)
<matplotlib.collections.PathCollection object at 0x049DBFB0>
>>> ax.scatter(reconMat[:,0].flatten().A[0], reconMat[:,1].flatten().A[0], marker='^', s=50, c='red')
<matplotlib.collections.PathCollection object at 0x04B4AB70>
>>> fig.show()