PCA降维为数据计算提供了很好的工具。下面,我们以人脸数据集(JAFFE)为例,比较采用PCA降维前后KMeans聚类的耗时,可以看到PCA降维带来了明显的加速效果。
不啰嗦了,上代码:
from PIL import Image
import numpy as np
from numpy import *
def load_image_PIL(image_dir='d:/learning/pcaandknn', image_size=(256, 256)):
    """Load every image in a directory into one 2-D float matrix.

    Each image is converted to a float32 array, flattened, and stored as one
    row of the result, so the matrix has shape
    ``(number_of_files, image_size[0] * image_size[1])``.

    Args:
        image_dir: Directory whose entries are all opened as images.
            Defaults to the path used by the original script.
        image_size: ``(height, width)`` every image is expected to have.
            Defaults to 256 x 256.
            NOTE(review): assumes single-channel (grayscale) images; a
            multi-channel image flattens to more values than one row holds.

    Returns:
        The filled ``numpy.ndarray`` (float64, as created by ``np.zeros``).

    Raises:
        ValueError: If a flattened image does not have exactly
            ``image_size[0] * image_size[1]`` values.
    """
    import os
    from PIL import Image

    # os.listdir returns entries in arbitrary order; sorting makes the
    # row order (and therefore the printed cluster labels) reproducible.
    file_names = sorted(os.listdir(image_dir))
    n_pixels = image_size[0] * image_size[1]
    dataMat = np.zeros((len(file_names), n_pixels))

    for row, file_name in enumerate(file_names):
        # The context manager releases the underlying file handle promptly.
        with Image.open(os.path.join(image_dir, file_name)) as img:
            dataMat[row, :] = np.array(img, dtype='float32').ravel()

    print(dataMat, dataMat.shape)
    return dataMat
def Modul_pca_front_fit(X, n_clusters=10, model_path="d:\\km.pkl"):
    """Cluster the rows of ``X`` with KMeans, print the labels, persist the model.

    Args:
        X: 2-D array-like with one sample per row.
        n_clusters: Number of clusters to fit. Defaults to 10, the value
            used by the original script.
        model_path: File the fitted model is pickled to and then read back
            from. Defaults to the path used by the original script.

    Returns:
        None. The cluster label of every row is printed, and the fitted
        estimator is written to ``model_path``.
    """
    from sklearn.cluster import KMeans
    try:
        import joblib
    except ImportError:
        # Fallback for old scikit-learn releases that only shipped the
        # vendored copy; ``sklearn.externals.joblib`` was removed in 0.23.
        from sklearn.externals import joblib

    clf = KMeans(n_clusters=n_clusters)
    clf.fit(X)
    # The fitted centroids are available as clf.cluster_centers_.
    print(clf.labels_)

    # Save the fitted model and load it back to demonstrate persistence;
    # the reloaded estimator is not used any further here.
    joblib.dump(clf, model_path)
    clf = joblib.load(model_path)
def Modul_pca_after_fit(X):
    """Project ``X`` onto the principal components that keep 95% of the variance.

    Args:
        X: 2-D array-like with one sample per row.

    Returns:
        The PCA-transformed data, one row per sample. Its shape is printed
        before returning.
    """
    # Only PCA is needed here. The previously imported (and unused)
    # ``sklearn.externals.joblib`` no longer exists in scikit-learn >= 0.23
    # and made this function raise ImportError.
    from sklearn.decomposition import PCA

    # A float in (0, 1) asks PCA to keep as many components as are needed
    # to explain that fraction of the variance.
    pca = PCA(0.95)
    X_reduction = pca.fit_transform(X)
    # The number of components actually kept is pca.n_components_.
    print(X_reduction.shape)
    return X_reduction
# Load the face images into a matrix with one flattened image per row.
X=load_image_PIL()
# The lines below use the IPython ``%time`` magic, so they only run inside
# IPython/Jupyter. The trailing comments are the timings the author recorded.
# KMeans on the raw pixel matrix.
%time Modul_pca_front_fit(X) #11.1 s
# Replace X with its PCA projection.
%time X=Modul_pca_after_fit(X) #2.73 s
# KMeans again, this time on the PCA-reduced matrix.
%time Modul_pca_front_fit(X) #60 ms
降维后再执行聚类算法,只需要60 ms(降维前需要11.1 s),速度提升非常明显。并且特征维度从65536维降到了85维,是不是很激动啊。