"""
@Title: K-means
@Time: 2024/2/16
@Author: Michael Jie
"""
import numpy as np
from sklearn.cluster import KMeans
print("----------手动实现K-means----------")
# Data set: ten samples with two features, forming two well-separated clusters.
data_sets = np.array([[1, 2],
                      [4.5, 0.2],
                      [95, 97],
                      [95.5, 95],
                      [2.9, 1],
                      [96, 100],
                      [4.2, 3.2],
                      [98.7, 95],
                      [2.1, 1.1],
                      [95.4, 99]])
# Number of samples.
m = len(data_sets)
# Number of clusters.
n = 2
# Initialize the two centroids from the first and last samples.
# BUG FIX: .copy() is required — without it x1/x2 are views aliasing rows of
# data_sets, so the centroid updates below would silently corrupt the data set.
x1 = data_sets[0].copy()
x2 = data_sets[-1].copy()
print("初始的组中点为:({x1}, {x2})".format(x1=x1, x2=x2))
# Assignment step: put each sample in the group of its nearest centroid.
# The data is simple enough that a single pass converges (no iteration needed).
sets1 = []
sets2 = []
for data_set in data_sets:
    # Squared Euclidean distance to each centroid — the metric K-means actually
    # minimizes (the original used Manhattan distance here, inconsistent with
    # the squared-error loss computed below).
    dev1 = np.sum(np.square(data_set - x1))
    dev2 = np.sum(np.square(data_set - x2))
    # BUG FIX: assign the sample to the *closer* centroid.  The original
    # comparison was inverted (dev1 > dev2), so each group collected the
    # points belonging to the other centroid.
    if dev1 < dev2:
        sets1.append(data_set)
    else:
        sets2.append(data_set)
print("组数据为:({set1}, {set2})".format(set1=sets1, set2=sets2))
# Update step: move each centroid to the mean of its assigned samples.
sets1 = np.array(sets1)
sets2 = np.array(sets2)
x1 = sets1.mean(axis=0)
x2 = sets2.mean(axis=0)
print("分组后的中点为:({x1}, {x2})".format(x1=x1, x2=x2))
# Loss (inertia): sum of squared distances of each sample to its centroid.
loss = 0
for set1 in sets1:
    loss += np.sum(np.square(set1 - x1))
for set2 in sets2:
    loss += np.sum(np.square(set2 - x2))
print("损失为:{loss}".format(loss=loss))
print("----------使用sklearn实现K-means----------")
# The same ten two-feature samples as above, kept as a plain nested list —
# scikit-learn converts it to an array internally.
samples = [[1, 2],
           [4.5, 0.2],
           [95, 97],
           [95.5, 95],
           [2.9, 1],
           [96, 100],
           [4.2, 3.2],
           [98.7, 95],
           [2.1, 1.1],
           [95.4, 99]]
# Fit a two-cluster model; a fixed random_state makes the run reproducible.
model = KMeans(n_clusters=2, random_state=0).fit(samples)
print("分组后的中点为:{x}".format(x=model.cluster_centers_))
print("损失为:{loss}".format(loss=model.inertia_))
# Transcript of one sample run, kept for reference (a bare string literal —
# it is never executed as anything other than a no-op expression).
"""
----------手动实现K-means----------
初始的组中点为:([1. 2.], [95.4 99. ])
组数据为:([array([95., 97.]), array([95.5, 95. ]), array([ 96., 100.]), array([98.7, 95. ]), array([95.4, 99. ])],
[array([1., 2.]), array([4.5, 0.2]), array([2.9, 1. ]), array([4.2, 3.2]), array([2.1, 1.1])])
分组后的中点为:([96.12 97.2 ], [2.94 1.5 ])
损失为:43.360000000000014
----------使用sklearn实现K-means----------
分组后的中点为:[[96.12 97.2 ]
 [ 2.94 1.5 ]]
损失为:43.36000000000001
"""