实验要求(data.txt)
(1)算法原理描述
(2)算法结构
(3)写出K-means具体功能函数(不能直接调用sklearn.cluster.KMeans功能函数)
具体函数功能中返回值包括 数据类标签,类中心,输入包括:数据,类别数
(4)可视化画图,不同类数据采用不同颜色
(5)算法分析:类内方差,平均方差,不同初始点对聚类结果的影响?如何解决?
python代码
import numpy as np
from matplotlib import pyplot as plt
# Select the font family matplotlib uses for text. The plot title and axis
# labels in this script are Chinese, so a CJK-capable font is requested
# (assumes "Microsoft YaHei" is installed on this machine — TODO confirm on
# non-Windows systems).
plt.rcParams['font.family'] = ['Microsoft YaHei']
class K_Means(object):
    """K-means clustering (Lloyd's algorithm) written directly on NumPy.

    Parameters
    ----------
    k : int
        Number of clusters.
    tolerance : float
        Convergence threshold. Iteration stops once, for every center, the
        summed absolute per-coordinate change (expressed in percent of the
        previous coordinate) is not greater than this value.
    max_iter : int
        Upper bound on the number of assign/update iterations.
    init_offset : int
        Row index of the first sample used as an initial center; the ``k``
        consecutive rows starting there seed the centers. The default of 19
        keeps the historical choice of rows 19 .. 19 + k - 1. When the data
        set is too short for that, the offset is clamped so the seed rows
        still fit inside the data.

    Attributes (set by :meth:`fit`)
    -------------------------------
    centers_ : dict
        Maps cluster index -> center coordinates.
    clf_ : dict
        Maps cluster index -> list of the samples assigned to that cluster.
    labels_ : numpy.ndarray
        Cluster index of every sample, in input order.
    """

    def __init__(self, k=2, tolerance=0.0001, max_iter=300, init_offset=19):
        self.k_ = k
        self.tolerance_ = tolerance
        self.max_iter_ = max_iter
        self.init_offset_ = init_offset

    def fit(self, data):
        """Cluster ``data`` and return ``(labels, centers)``.

        Parameters
        ----------
        data : array-like
            Samples, one per row.

        Returns
        -------
        labels : numpy.ndarray
            Cluster index of each sample.
        centers : numpy.ndarray
            Final cluster centers, row ``i`` belonging to cluster ``i``.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1 or there are fewer samples than
            clusters.
        """
        data = np.asarray(data, dtype=float)
        if self.k_ < 1:
            raise ValueError("k must be at least 1")
        if data.ndim == 0 or data.shape[0] < self.k_:
            raise ValueError("need at least k samples to initialise k centers")
        n_samples = data.shape[0]

        # Seed the centers from k consecutive rows. Clamp the offset so the
        # rows always exist instead of raising IndexError on short data sets.
        start = min(max(self.init_offset_, 0), n_samples - self.k_)
        self.centers_ = {idx: data[start + idx].copy() for idx in range(self.k_)}

        # Make the result attributes exist even when max_iter is 0.
        self.clf_ = {idx: [] for idx in range(self.k_)}
        self.labels_ = np.full(n_samples, -1, dtype=int)

        for _ in range(self.max_iter_):
            self.labels_, self.clf_ = self._assign(data)

            # Shallow copy is enough: centers are replaced, never mutated.
            previous = dict(self.centers_)
            for idx, members in self.clf_.items():
                # An empty cluster keeps its previous center; averaging an
                # empty list would yield NaN and corrupt later iterations.
                if members:
                    self.centers_[idx] = np.average(members, axis=0)

            moved = any(
                self._has_moved(previous[idx], self.centers_[idx])
                for idx in self.centers_
            )
            if not moved:
                break

        centers = np.array([self.centers_[idx] for idx in range(self.k_)])
        return self.labels_, centers

    def _assign(self, data):
        """Assign every sample to its nearest center (ties go to the lowest index)."""
        clusters = {idx: [] for idx in range(self.k_)}
        labels = np.empty(len(data), dtype=int)
        for row, feature in enumerate(data):
            distances = [
                np.linalg.norm(feature - self.centers_[idx])
                for idx in range(self.k_)
            ]
            nearest = int(np.argmin(distances))
            labels[row] = nearest
            clusters[nearest].append(feature)
        return labels, clusters

    def _has_moved(self, previous, current):
        """Return True when a center changed by more than the tolerance (in percent)."""
        shift = np.abs(current - previous)
        scale = np.abs(previous)
        # Absolute values stop opposite-sign shifts from cancelling out.
        # Where the previous coordinate is 0 a relative change is undefined,
        # so fall back to the absolute shift rather than dividing by zero.
        safe_scale = np.where(scale > 0, scale, 1.0)
        return float(np.sum(shift / safe_scale * 100.0)) > self.tolerance_
# Load the space-delimited samples stored in data.txt and split off the
# first two columns as the plotting coordinates.
data = np.loadtxt('data.txt', dtype=float, delimiter=" ")
X, Y = data[..., 0], data[..., 1]

# Preview the raw samples as blue dots before any clustering is done.
plt.plot(X, Y, 'ob')
plt.xlabel("X轴")
plt.ylabel("Y轴")
plt.title("散点图观察特诊")
plt.show()
if __name__ == '__main__':
    # Cluster the loaded samples into four groups and report the centers.
    k_means = K_Means(k=4)
    k_means.fit(data)
    print(k_means.centers_)

    rgb = ['r', 'g', 'b', 'y']

    # One scatter call per cluster instead of one per sample: the picture is
    # the same, but matplotlib creates k artists rather than one per point.
    for cat, members in k_means.clf_.items():
        if not members:
            # Nothing was assigned to this cluster, so there is nothing to draw.
            continue
        points = np.asarray(members)
        plt.scatter(points[:, 0], points[:, 1], c=rgb[cat % len(rgb)])

    # Draw the centers last and on top, in their cluster's colour with a
    # black outline, so the stars are not hidden underneath the samples.
    for cat, center in k_means.centers_.items():
        plt.scatter(center[0], center[1], marker='*', s=150,
                    c=rgb[cat % len(rgb)], edgecolors='k', zorder=3)
    plt.show()