目录
算法原理:
算法流程:
编程实现:
import numpy as np
import matplotlib.pyplot as plt
from numpy import * # 导入numpy的库函数
from scipy.spatial.distance import pdist
# 读取txt文件
def load_data(filename):
    """Read a tab-delimited numeric text file into a NumPy array.

    Args:
        filename: Path (or file-like object) accepted by ``np.loadtxt``.

    Returns:
        The array produced by ``np.loadtxt`` with ``"\\t"`` as delimiter.
    """
    samples = np.loadtxt(filename, delimiter="\t")
    return samples
# 欧氏距离
def dist_Eclud(vecA, vecB):
    """Return the Euclidean (L2) distance between two points.

    Every component of the vectors contributes to the result, so the
    function works for points of any dimension (for 2-D points the value
    is sqrt(dx**2 + dy**2)).

    Args:
        vecA: First point; any array-like of numbers.
        vecB: Second point; must be broadcast-compatible with ``vecA``.

    Returns:
        The distance as a NumPy floating-point scalar.

    Raises:
        ValueError: If the two inputs cannot be broadcast together.
    """
    # asarray lets plain lists/tuples be subtracted element-wise as well.
    diff = np.asarray(vecA) - np.asarray(vecB)
    return np.sqrt(np.sum(np.square(diff)))
# 曼哈顿距离
def distManhattan(vecA, vecB):
    """Return the Manhattan (L1) distance between two vectors.

    Args:
        vecA: First vector; must support element-wise subtraction.
        vecB: Second vector, same shape as ``vecA``.

    Returns:
        The sum of the absolute component-wise differences.
    """
    offsets = vecA - vecB
    return np.abs(offsets).sum()
# 余弦距离
def distCosine(vecA, vecB):
    """Return the cosine distance between two vectors.

    The two vectors are stacked into a two-row matrix and handed to
    ``scipy.spatial.distance.pdist``.

    Args:
        vecA: First vector.
        vecB: Second vector, same length as ``vecA``.

    Returns:
        A one-element array (the condensed distance matrix produced by
        ``pdist`` for two observations), not a bare scalar.
    """
    pair = np.vstack((vecA, vecB))
    return pdist(pair, metric='cosine')
# 切比雪夫距离
def distChebyshev(vecA, vecB):
    """Return the Chebyshev (L-infinity) distance between two vectors.

    Args:
        vecA: First vector; must support element-wise subtraction.
        vecB: Second vector, same shape as ``vecA``.

    Returns:
        The largest absolute component-wise difference.
    """
    offsets = vecA - vecB
    return np.abs(offsets).max()
# 初始化质心,得到k个随机质心
def initialize_vec(sample_set, k):
    """Pick ``k`` random samples to serve as the initial centroids.

    Args:
        sample_set: 2-D array-like of shape (m, n), one sample per row.
        k: Number of centroids to draw.

    Returns:
        A new float array of shape (k, n). It is a copy, so callers may
        update the centroids in place without touching ``sample_set``.

    Raises:
        ValueError: If ``sample_set`` is not 2-D or contains no rows.
    """
    samples = np.asarray(sample_set, dtype=float)
    m, _ = samples.shape
    # Draw distinct rows whenever possible: duplicated starting centroids
    # leave one cluster empty on the first assignment pass.  Fall back to
    # sampling with replacement only when more centroids than samples are
    # requested.
    chosen = np.random.choice(m, size=k, replace=k > m)
    # Fancy indexing returns a copy with the samples' own column count,
    # so the result is not limited to 2-D points.
    return samples[chosen]
# 初始化簇类,得到k个空集簇类
def initialize_class(C, k):
    """Reset the cluster container so keys ``0 .. k-1`` map to empty lists.

    Args:
        C: Mutable mapping to fill; it is modified in place.
        k: Number of clusters.

    Returns:
        The same object ``C``, with a fresh, independent empty list stored
        under each key in ``range(k)``. Other keys are left untouched.
    """
    for cluster_id in range(k):
        C[cluster_id] = list()
    return C
# 损失函数计算
def loss(k, A, initialize_mat):
    """Compute the clustering loss averaged over the clusters.

    Args:
        k: Number of clusters.
        A: Mapping from cluster index to the sequence of samples in it.
        initialize_mat: Centroids, indexable by cluster index.

    Returns:
        The sum of the Euclidean distances (``dist_Eclud``) between every
        sample and the centroid of its cluster, divided by ``k``.
    """
    total = 0
    for cluster_id in range(k):
        centre = initialize_mat[cluster_id]
        for point in A[cluster_id]:
            total += dist_Eclud(centre, point)
    return total / k
# 绘制图像
def plot_img(k, A, initialize_mat, mark, mark1):
    """Plot every cluster's samples and its centroid, then show the figure.

    Args:
        k: Number of clusters.
        A: Mapping from cluster index to the sequence of samples in it;
            components 0 and 1 of each sample are used as x and y.
        initialize_mat: Centroids, indexable by cluster index.
        mark: Plot format strings for the samples, one per cluster.
        mark1: Plot format strings for the centroids, one per cluster.
    """
    for cluster_id in range(k):
        for point in A[cluster_id]:
            plt.plot(point[0], point[1], mark[cluster_id])
        # Draw the centroid after its members so it sits on top of them.
        centre = initialize_mat[cluster_id]
        plt.plot(centre[0], centre[1], mark1[cluster_id])
    plt.show()
# k均值聚类
def kmean(k, data=None, max_iter=400, dist_func=None):
    """Cluster the samples into ``k`` groups with Lloyd's k-means iteration.

    Each pass assigns every sample to its nearest centroid and then moves
    every centroid to the mean of the samples assigned to it.  Iteration
    stops as soon as a pass leaves all centroids unchanged, or after
    ``max_iter`` passes.

    Args:
        k: Number of clusters.
        data: 2-D array-like of samples, one per row.  When omitted, the
            module-level ``test_set`` is used.
        max_iter: Upper bound on the number of passes.
        dist_func: Callable ``f(sample, centroid)`` returning the distance
            between two points (e.g. ``dist_Eclud``, ``distManhattan``,
            ``distCosine``).  Defaults to ``distChebyshev``.

    Returns:
        A tuple ``(clusters, centroids)``: ``clusters`` maps each cluster
        index in ``range(k)`` to the list of samples assigned to it during
        the last pass, and ``centroids`` is the array of final centroids,
        one row per cluster.
    """
    samples = np.asarray(test_set if data is None else data)
    if dist_func is None:
        dist_func = distChebyshev

    centroids = initialize_vec(samples, k)
    distances = np.zeros(k)
    clusters = initialize_class({}, k)

    for _ in range(max_iter):
        # Assignment step: each sample joins the cluster of its nearest
        # centroid (the lowest index wins on ties).
        clusters = initialize_class({}, k)
        for sample in samples:
            for i in range(k):
                # .item() also accepts the one-element array returned by
                # pdist-based metrics such as distCosine.
                distances[i] = np.asarray(dist_func(sample, centroids[i])).item()
            clusters[int(np.argmin(distances))].append(sample)

        # Update step: move each centroid to the mean of its members.
        moved = False
        for i in range(k):
            members = clusters[i]
            if not members:
                # An empty cluster has no mean; keep its previous centroid
                # instead of dividing by zero and producing NaNs.
                continue
            new_centre = np.sum(members, axis=0) / len(members)
            if not (new_centre == centroids[i]).all():
                centroids[i] = new_centre
                moved = True

        # No centroid moved, so further passes would repeat this one.
        if not moved:
            break

    return clusters, centroids
# Driver script: cluster the samples in testSet.txt, report the result and
# plot it.
# Number of clusters to form.
k = 4
# Plot format strings for the samples of each cluster; plot_img indexes
# this list by cluster number.
mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
# Plot format strings for the cluster centres, also indexed by cluster
# number.  There are only six entries, so plot_img fails with IndexError
# when k > 6.
mark1 = ['^b', '+b', 'sb', 'db', '<b', 'pb']
# Module-level sample matrix; kmean reads this global.
test_set = load_data("testSet.txt")
classed_mat, centre_point = kmean(k)
print("center_point:", centre_point)
# Sum of sample-to-centre Euclidean distances divided by k (see loss).
losses = loss(k, classed_mat, centre_point)
print("mean_loss:", losses)
plot_img(k, classed_mat, centre_point, mark, mark1)
结果: