# K-means:将聚类结果与对应的原始数据一起保存
# 原始数据样式:
# K-Means Clustering
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
from scipy.cluster.vq import whiten
# Read the order table. header=None makes pandas treat the first line as data
# and label the columns with integers instead of names.
path = r'D:\data\cnndata\order.csv'
dataset = pd.read_csv(path, header=None)

# Expose the complete table (all rows, all columns) as a single NumPy array.
X = dataset.values
# y = dataset.iloc[:, 3].values

# Splitting the dataset into the Training set and Test set (disabled):
# from sklearn.cross_validation import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Feature Scaling (disabled):
# from sklearn.preprocessing import StandardScaler
# sc_X = StandardScaler()
# X_train = sc_X.fit_transform(X_train)
# X_test = sc_X.transform(X_test)
# sc_y = StandardScaler()
# y_train = sc_y.fit_transform(y_train)
# Using the elbow method to find the optimal number of clusters
from sklearn.cluster import KMeans

# Fit one model per candidate cluster count (1..14) and record its inertia,
# plotted below as WCSS. The bend ("elbow") of the curve is read off the plot
# to choose the number of clusters.
wcss = []
for i in range(1, 15):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    # Cluster on columns 6, 7 and 8 only (the slice end, 9, is exclusive).
    kmeans.fit(X[:, 6:9])
    wcss.append(kmeans.inertia_)

# The x axis uses the same 1..14 range as the loop so each point lines up with
# the cluster count that produced it.
plt.plot(range(1, 15), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
# Fitting K-Means to the dataset
# Plain K-means can fall into the random initialization trap; k-means++
# seeding is used here to mitigate it. n_init is not passed, so the number of
# restarts is whatever the installed scikit-learn version defaults to.
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42)

# Cluster on columns 6, 7 and 8 of the raw data (the slice end is exclusive).
# y_kmeans is used below as the fitted model: the per-row cluster ids are read
# from its labels_ attribute.
y_kmeans = kmeans.fit(X[:, 6:9])
labels = y_kmeans.labels_

# Position of the tag column: one past the last original column.
m = np.shape(X)[1]

# For every cluster id, select the original rows assigned to that cluster and
# append the id as a trailing tag column. Iterating over kmeans.n_clusters
# keeps this in step with the model instead of hard-coding ids 0..3.
labeled_groups = [
    np.insert(X[labels == cluster_id, :], m, cluster_id, axis=1)
    for cluster_id in range(kmeans.n_clusters)
]

# Debug output: the tagged rows of cluster 0.
print(labeled_groups[0])

# Stack the groups in cluster-id order, so the output rows are grouped by tag.
A = np.vstack(labeled_groups)

# Export the tagged rows with named columns. The list has 12 names, so X is
# expected to have 11 columns before the tag is added. dtype=str stores every
# cell as text.
pd_data = pd.DataFrame(
    A,
    columns=['id', 'userid', 'dayhot', 'day', 'orderhot', 'order',
             'R', 'F', 'E', 'O', 'sum', 'tag'],
    dtype=str,
)
pd_data.to_csv('D:\\data\\cnndata\\pd_dataNsocre-1.csv')