# 1. Hand-written k-means implementation (手写算法实现 k-means)
import numpy as np
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
# 随机产生k个点
def init_centroids(k, n_features):
    """Draw ``k`` random starting centroids.

    Args:
        k: Number of centroids to generate.
        n_features: Number of coordinates per centroid.

    Returns:
        A ``(k, n_features)`` float array whose entries are sampled
        uniformly from the half-open interval ``[0.0, 1.0)`` using NumPy's
        global random state (so ``np.random.seed`` makes it reproducible).
    """
    return np.random.random_sample((k, n_features))
# points代表数据点
def update_centroids(points, centroid_index, n_features):
    """Recompute each centroid as the mean of the points assigned to it.

    Args:
        points: ``(n_points, n_features)`` array of data points.
        centroid_index: Integer cluster label for every row of ``points``.
        n_features: Number of coordinates per point.

    Returns:
        A ``(k, n_features)`` float array where ``k`` is the largest label
        plus one. Row ``i`` is the column-wise mean of the points labelled
        ``i``. A label in ``range(k)`` that owns no points is re-seeded on a
        data point instead of being left as NaN. An empty
        ``centroid_index`` yields an empty ``(0, n_features)`` array.
    """
    points = np.asarray(points, dtype=float)
    labels = np.asarray(centroid_index, dtype=int)
    if labels.size == 0:
        return np.zeros((0, n_features))

    k = int(labels.max()) + 1
    new_centroids = np.zeros((k, n_features))
    empty = []
    for i in range(k):
        members = points[labels == i]
        if len(members):
            # Column-wise mean, i.e. the average coordinate of the cluster.
            new_centroids[i] = members.mean(axis=0)
        else:
            empty.append(i)

    if empty:
        # Taking the mean of an empty slice would give NaN (and a
        # RuntimeWarning).  Move each empty cluster onto one of the points
        # that lie farthest from the centroid of the cluster they currently
        # belong to.  Every label that occurs has a non-empty cluster, so
        # ``new_centroids[labels]`` only reads rows that were just computed.
        spread = np.linalg.norm(points - new_centroids[labels], axis=1)
        farthest = np.argsort(spread)[::-1]
        # np.resize repeats indices if there are more empty clusters than
        # points.
        new_centroids[empty] = points[np.resize(farthest, len(empty))]

    return new_centroids
def distance(pointA, pointB):
    """Return the Euclidean distance between two points.

    Args:
        pointA: Coordinates of the first point (any sequence of numbers).
        pointB: Coordinates of the second point, same length as ``pointA``.

    Returns:
        The square root of the summed squared coordinate differences. All
        coordinates take part, so the function works for any number of
        dimensions, not only for 2-D points.
    """
    diff = np.asarray(pointA, dtype=float) - np.asarray(pointB, dtype=float)
    return np.sqrt(np.sum(diff ** 2))
def belong(point, centroids):
    """Return the index of the centroid closest to ``point``.

    Args:
        point: Coordinates of a single data point.
        centroids: Sequence of centroid coordinates.

    Returns:
        The position in ``centroids`` with the smallest ``distance`` to
        ``point``. Ties go to the lowest index because only a strictly
        smaller distance replaces the current best. If ``centroids`` is
        empty, or no distance compares below infinity (for example every
        distance is NaN), ``0`` is returned.
    """
    index = 0
    min_distance = np.inf  # nothing seen yet, so any real distance wins
    for i, centroid in enumerate(centroids):
        d = distance(point, centroid)
        if d < min_distance:
            min_distance = d
            index = i
    return index
def update_index(points, centroids):
    """Assign every point to its nearest centroid.

    The work is vectorized over the points: there is one NumPy pass per
    centroid instead of one Python-level call per (point, centroid) pair,
    which matters for large inputs such as a dense plotting mesh.

    Args:
        points: ``(n_points, n_features)`` array of data points.
        centroids: ``(k, n_features)`` array (or sequence) of centroids.

    Returns:
        An integer array of length ``n_points`` holding, for each point, the
        index of the closest centroid by Euclidean distance. Ties go to the
        lowest index. A centroid whose distance is NaN is never selected,
        and a point for which no centroid qualifies keeps label ``0``.
    """
    points = np.asarray(points, dtype=float)
    n_points = len(points)
    labels = np.zeros(n_points, dtype=int)
    if n_points == 0:
        return labels

    best = np.full(n_points, np.inf)
    for i, centroid in enumerate(np.asarray(centroids, dtype=float)):
        d = np.sqrt(((points - centroid) ** 2).sum(axis=1))
        # Strict "<" keeps the first of several equally close centroids and
        # is False for NaN distances, so NaN centroids are skipped.
        closer = d < best
        labels[closer] = i
        best[closer] = d[closer]
    return labels
# 完整的自我定义的k-means
def my_kmeans(points, k=10, max_iter=1000):
    """Cluster ``points`` with a plain Lloyd-style k-means loop.

    Args:
        points: ``(n_points, n_features)`` array of data points.
        k: Number of clusters (defaults to 10, the value previously
            hard-coded).
        max_iter: Upper bound on the number of update rounds (defaults to
            1000, the value previously hard-coded).

    Returns:
        A tuple ``(centroids, indeces)``: ``centroids`` is a
        ``(k, n_features)`` float array and ``indeces`` is an integer array
        with the cluster label of every point.

    Raises:
        ValueError: If ``points`` is not a non-empty 2-D array.
    """
    points = np.asarray(points, dtype=float)
    if points.ndim != 2 or len(points) == 0:
        raise ValueError("points must be a non-empty 2-D array")
    n_features = points.shape[1]

    # init_centroids samples from [0, 1); stretch those samples over the
    # bounding box of the data so the starting centroids lie where the
    # points actually are, whatever the scale of the data.
    lo = points.min(axis=0)
    hi = points.max(axis=0)
    centroids = lo + init_centroids(k, n_features) * (hi - lo)

    indeces = update_index(points, centroids)
    for i in range(max_iter):
        old_indeces = indeces
        updated = update_centroids(points, indeces, n_features)
        # update_centroids sizes its result by the largest label in use and
        # may yield NaN rows for clusters without members.  Only overwrite
        # centroids that received a valid new position; the others keep
        # their previous location so that exactly k finite centroids
        # survive every round.
        valid_rows = np.flatnonzero(~np.isnan(updated).any(axis=1))
        centroids[valid_rows] = updated[valid_rows]
        indeces = update_index(points, centroids)
        if np.array_equal(indeces, old_indeces):
            # Assignments are stable, so the algorithm has converged.
            print("运行第{}次跳出循环".format(i))
            break
    return centroids, indeces
# Load the handwritten-digit data set bundled with scikit-learn.
digits = load_digits()

# Optional preview of the raw images (disabled):
# plt.matshow(digits.images[1])
# plt.show()
#
# images_and_labels = list(zip(digits.images, digits.target))
# plt.figure(figsize=(10, 10))
# for index, (image, label) in enumerate(images_and_labels[:25]):
#     plt.subplot(5, 5, index + 1)
#     plt.axis('off')
#     plt.imshow(image, cmap=plt.cm.binary)
#     plt.title('Training: %i' % label)
# plt.show()

# Standardize every feature column (zero mean, unit standard deviation),
# then reduce the data to two dimensions with PCA.  The model is fitted
# once and the same fitted object stays available as ``pca``.
data = scale(digits.data)
pca = PCA(n_components=2)
points = pca.fit_transform(data)

# Font settings are applied before any text is created: the title contains
# CJK characters, so SimHei is selected as the sans-serif font, and
# 'axes.unicode_minus' is switched off as in the original settings.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Run the hand-written k-means on the projected points.
centroids, indeces = my_kmeans(points)

# Label every node of a dense mesh covering the data (with a margin of 1)
# so that the cluster regions can be drawn as a background image.
h = 0.02  # mesh step size
x_min, x_max = points[:, 0].min() - 1, points[:, 0].max() + 1
y_min, y_max = points[:, 1].min() - 1, points[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# ravel() flattens each grid to 1-D; np.c_ stacks the two vectors as
# columns, giving one (x, y) row per mesh node.
all_points = np.c_[xx.ravel(), yy.ravel()]
Z = update_index(all_points, centroids)
print(Z)
Z = Z.reshape(xx.shape)

# Select figure 1 and clear anything drawn on it.
plt.figure(1)
plt.clf()
# Row 0 of Z corresponds to y_min, so the image must be drawn with its
# origin at the bottom; with imshow's default (origin at the top) the
# regions would be flipped vertically relative to the points plotted below.
plt.imshow(Z, aspect='auto', origin='lower',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()))
plt.plot(points[:, 0], points[:, 1], 'k.', markersize=2)
plt.title("k-mean聚类效果显示")
# Mark the centroids with white crosses on top of everything else.
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
plt.show()
# Result of the hand-written implementation (实现结果)
# 2. k-means using the library implementation (调用库函数实现 k-means)
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt
import numpy as np
# Load the data.
digits = load_digits()
# Standardize every feature column (zero mean, unit standard deviation),
# then reduce the data to two dimensions with PCA.  The model is fitted
# once and the same fitted object stays available as ``pca``.
data = scale(digits.data)
pca = PCA(n_components=2)
points = pca.fit_transform(data)

# Font settings are applied before any text is created: the title contains
# CJK characters, so SimHei is selected as the sans-serif font, and
# 'axes.unicode_minus' is switched off as in the original settings.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Cluster the projected points with scikit-learn's KMeans.
k_means = KMeans(n_clusters=10, max_iter=1500, init='k-means++', n_init=10)
k_means.fit(points)
centroids = k_means.cluster_centers_

# Predict a label for every node of a dense mesh covering the data (with a
# margin of 1) so that the cluster regions can be drawn as a background.
h = 0.02  # mesh step size
x_min, x_max = points[:, 0].min() - 1, points[:, 0].max() + 1
y_min, y_max = points[:, 1].min() - 1, points[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = k_means.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Select figure 2 and clear anything drawn on it.
plt.figure(2)
plt.clf()
# Row 0 of Z corresponds to y_min, so the image must be drawn with its
# origin at the bottom; with imshow's default (origin at the top) the
# regions would be flipped vertically relative to the points plotted below.
plt.imshow(Z, aspect='auto', origin='lower',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()))
plt.plot(points[:, 0], points[:, 1], 'k.', markersize=2)
plt.title("k-mean聚类效果显示")
# Mark the centroids with white crosses on top of everything else.
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
plt.show()
# Resulting plot (效果显示)