import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.datasets import make_blobs
np.random.seed(123)
X,y = make_blobs(centers = 4,n_samples = 1000) # make_blobs generates an isotropic Gaussian dataset for clustering
# n_samples: total number of samples to generate (m = 1000)
# centers: number of cluster centers (4 clusters here)
print(f'Shape of dataset:{X.shape}')
# Visualise the raw dataset before clustering
fig = plt.figure(figsize = (8,6)) # figure size: 8 inches wide, 6 inches tall
plt.scatter(X[:,0],X[:,1])
plt.title("Dataset with 4 clusters")
plt.xlabel("First feature")
plt.ylabel("Second feature")
plt.show()
class KMeans():
    """Naive K-Means clustering (Lloyd's algorithm).

    Centers are initialised by sampling k distinct points from the data,
    then assignment/update steps repeat until the assignment of points to
    clusters no longer changes.
    """

    def __init__(self, n_clusters):
        # k: the number of clusters to fit.
        self.k = n_clusters

    def fit(self, data):
        """Fit the K-Means model to `data`.

        Parameters
        ----------
        data : ndarray of shape (n_samples, n_features)
            Training points.

        Side effects: sets `self.centers` (final centroids) and
        `self.initial_centers` (copy of the random initialisation),
        and prints the iteration count on convergence. Returns None.
        """
        n_samples, _ = data.shape
        # Initialise the k centers by sampling k distinct data points.
        self.centers = np.array(random.sample(list(data), self.k))
        self.initial_centers = np.copy(self.centers)

        # Track the assignment of points to clusters; stop when it is stable.
        old_assigns = None
        n_iters = 0
        while True:
            new_assigns = [self.classify(datapoint) for datapoint in data]
            if new_assigns == old_assigns:
                print(f"Training finished after {n_iters} iterations")
                return
            old_assigns = new_assigns
            n_iters += 1  # Python has no i++ increment syntax
            # Recompute each center as the mean of its assigned points.
            for id_ in range(self.k):
                points_idx = np.where(np.array(new_assigns) == id_)
                datapoints = data[points_idx]
                # Bug fix: an empty cluster would yield a NaN mean and
                # poison the centers — keep the previous center instead.
                if datapoints.size > 0:
                    self.centers[id_] = datapoints.mean(axis=0)

    def l2_distance(self, datapoint):
        """Return the Euclidean distance from `datapoint` to every center."""
        dists = np.sqrt(np.sum((self.centers - datapoint) ** 2, axis=1))
        return dists

    def classify(self, datapoint):
        """Return the index of the cluster center closest to `datapoint`."""
        dists = self.l2_distance(datapoint)
        # argmin gives the index of the smallest distance.
        return np.argmin(dists)

    def plot_clusters(self, data, labels=None):
        """Scatter-plot `data` coloured by label, with the initial centers
        in black and the final centers in red.

        `labels` defaults to the module-level `y` for backward
        compatibility with the original script; pass labels explicitly
        when using this class outside this file.
        """
        plt.figure(figsize=(12, 10))
        plt.title("Initial centers in black, final centers in red")
        # NOTE(review): falls back to the global `y` when labels is None —
        # this preserves the original script's behaviour.
        plt.scatter(data[:, 0], data[:, 1], marker='.',
                    c=labels if labels is not None else y)
        plt.scatter(self.centers[:, 0], self.centers[:, 1], c='r')
        plt.scatter(self.initial_centers[:, 0], self.initial_centers[:, 1], c='k')
        plt.show()
if __name__ == '__main__':
    # Fit a 4-cluster model to the generated data and visualise the result.
    model = KMeans(n_clusters=4)
    model.fit(X)
    model.plot_clusters(X)
    # K-Means only finds a local optimum; rerun with different random
    # initialisations to look for a better (global) solution.
# Python implementation of the KMeans clustering algorithm
# (scraped blog-footer metadata: latest recommended article published 2024-03-25 22:06:02)