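# Reference: "利用python的KMeans和PCA包实现聚类算法" (Implementing clustering with Python's KMeans and PCA packages) - 鸿燕藏锋 - 博客园 (cnblogs)
# ./data/2.csv is just a few columns of simple, arbitrary sample data.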
# encoding: utf-8
from pandas import DataFrame, Series
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as ss
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# Load the raw samples; every column of the CSV is handed to KMeans as a
# feature, so the file is expected to contain numeric columns only.
df = pd.read_csv("./data/2.csv", low_memory=False)

# Partition the rows into 3 clusters. random_state is pinned so repeated runs
# start from the same centroid initialisation.
kmeans = KMeans(n_clusters=3, random_state=10).fit(df)
df['label'] = kmeans.labels_

# Number of rows in each cluster. GroupBy.size() counts rows per group; the
# previous apply(np.size) reported rows * columns instead of a row count.
df_count_type = df.groupby('label').size()
print(df_count_type)

# Cluster centres found by KMeans.
print(kmeans.cluster_centers_)
# Snapshot the labelled data as new_df and write it to new_df.csv.
# copy() gives new_df its own data instead of a slice that aliases df.
new_df = df.copy()
new_df.to_csv('new_df.csv')

# Reduce the features used for clustering to 2 dimensions and store the
# result in the DataFrame new_pca (columns 0 and 1, one row per sample).
# 'label' is the KMeans output rather than an input feature, so it is dropped
# before fitting; otherwise the cluster ids would distort the projection.
pca = PCA(n_components=2)
new_pca = pd.DataFrame(pca.fit_transform(new_df.drop(columns='label')))
# Visualisation: draw the 2-D projection of each cluster with its own
# matplotlib format string (red dots, green circles, blue stars).
cluster_markers = ((0, 'r.'), (1, 'go'), (2, 'b*'))
for cluster_id, marker in cluster_markers:
    in_cluster = new_df['label'] == cluster_id
    points = new_pca[in_cluster]
    plt.plot(points[0], points[1], marker)

# Write the current figure to disk first, then open the interactive window.
figure = plt.gcf()
figure.savefig('kmeans.png')
plt.show()