# KMeans clustering implemented in Python
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
#正则化数据集
def normalize(X, axis=-1, p=2):
    """Scale X so that each slice along `axis` has unit Lp norm.

    Slices whose norm is zero are divided by 1 instead, i.e. left as-is,
    to avoid division by zero.
    """
    norms = np.atleast_1d(np.linalg.norm(X, p, axis))
    safe_norms = np.where(norms == 0, 1, norms)
    return X / np.expand_dims(safe_norms, axis)
#计算一个样本与所有样本的欧氏距离的平方
def euclidean_distance(one_sample, X):
    """Return the squared Euclidean distance from one_sample to every row of X.

    Parameters
    ----------
    one_sample : array-like, flattened to a single row vector.
    X : array-like with X.shape[0] samples, flattened to 2-D.

    Returns
    -------
    1-D array of length X.shape[0] with the squared distances.
    """
    one_sample = one_sample.reshape(1, -1)
    X = X.reshape(X.shape[0], -1)
    # Broadcasting subtracts the row vector from every row of X without
    # materializing the tiled copy that np.tile would allocate.
    return np.power(X - one_sample, 2).sum(axis=1)
class Kmeans():
    """Plain K-means clustering with random initialization from the samples.

    Parameters
    ----------
    k : number of clusters.
    max_iteration : maximum number of assignment/update rounds.
    varepsilon : convergence tolerance — iteration stops once no center
        coordinate moves by varepsilon or more.
    """

    def __init__(self, k=2, max_iteration=500, varepsilon=0.0001):
        self.k = k
        self.max_iteration = max_iteration
        self.varepsilon = varepsilon

    def init_random_center(self, X):
        """Pick k samples (possibly with repeats) at random as initial centers."""
        n_samples, n_features = np.shape(X)
        centers = np.zeros((self.k, n_features))
        for i in range(self.k):
            centers[i] = X[np.random.choice(range(n_samples))]
        return centers

    def _closest_center(self, sample, centers):
        """Return the index in [0, self.k - 1] of the center nearest to sample."""
        # Squared Euclidean distance via broadcasting; argmin is unaffected
        # by the missing square root.
        dis = np.power(centers - sample.reshape(1, -1), 2).sum(axis=1)
        return np.argmin(dis)

    def create_clusters(self, centers, X):
        """Group sample indices by nearest center; returns a list of k lists."""
        clusters = [[] for _ in range(self.k)]
        for sample_i, sample in enumerate(X):
            center_i = self._closest_center(sample, centers)
            clusters[center_i].append(sample_i)
        return clusters

    def update_centers(self, clusters, X, old_centers=None):
        """Recompute each center as the mean of its assigned samples.

        An empty cluster keeps its previous center when old_centers is
        given (the mean of an empty slice would otherwise yield NaN);
        without old_centers an empty cluster's center stays at zero.
        """
        n_features = np.shape(X)[1]
        centers = np.zeros((self.k, n_features))
        for i, cluster in enumerate(clusters):
            if cluster:
                centers[i] = np.mean(X[cluster], axis=0)
            elif old_centers is not None:
                centers[i] = old_centers[i]
        return centers

    def get_cluster_labels(self, clusters, X):
        """Flatten the cluster index lists into a per-sample label array."""
        y_pred = np.zeros(X.shape[0])
        for cluster_i, cluster in enumerate(clusters):
            for sample_i in cluster:
                y_pred[sample_i] = cluster_i
        return y_pred

    def predict(self, X):
        """Cluster X and return an array of cluster labels, one per sample."""
        centers = self.init_random_center(X)
        clusters = self.create_clusters(centers, X)
        for _ in range(self.max_iteration):
            clusters = self.create_clusters(centers, X)
            former_centers = centers
            centers = self.update_centers(clusters, X, former_centers)
            # BUGFIX: the original tested `diff.any() < varepsilon`, which
            # compares a bool to the tolerance and only breaks when centers
            # are exactly equal. Converge when no coordinate moved by
            # varepsilon or more.
            if np.max(np.abs(centers - former_centers)) < self.varepsilon:
                break
        return self.get_cluster_labels(clusters, X)
def main():
    """Generate synthetic 3-D blobs, cluster them with Kmeans, and plot."""
    X, y = datasets.make_blobs(n_samples=10000,
                               n_features=3,
                               centers=[[3, 3, 3], [0, 0, 0], [1, 1, 1], [2, 2, 2]],
                               cluster_std=[0.2, 0.1, 0.2, 0.2],
                               random_state=9)
    # Cluster with our Kmeans implementation.
    clf = Kmeans(k=4)
    y_pred = clf.predict(X)
    # Visualize the *predicted* clusters. The original plotted the true
    # labels y instead of y_pred, and called 2-D plt.scatter whose third
    # positional argument is marker size, not the z coordinate — use a
    # 3-D axes and ax.scatter instead. (Direct Axes3D(fig, ...) construction
    # is removed in modern matplotlib.)
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(projection='3d')
    ax.view_init(elev=30, azim=20)
    for cluster_i in range(4):
        pts = X[y_pred == cluster_i]
        ax.scatter(pts[:, 0], pts[:, 1], pts[:, 2])
    plt.show()


if __name__ == "__main__":
    main()