import pandas as pd
import numpy as np
import xlrd
# Convergence tolerance: iteration stops once moving every center to its
# cluster mean lowers that cluster's squared error by no more than E.
E = 0.00001
# Rows of the data matrix used as the initial cluster centers (one per
# cluster). They are fixed, not random, so runs are reproducible.
INIT_ROWS = (1, 2, 10)
# Safety cap so that a non-converging run cannot loop forever.
MAX_ITER = 1000


def _nearest_center(data, centers):
    """Return, for every row of *data*, the index of the closest center.

    Closeness is measured with the Manhattan (L1) distance. Ties go to the
    center with the lowest index, which is what ``np.argmin`` does.
    """
    # One (n_rows,) distance vector per center keeps the temporary memory at
    # O(n_rows * n_cols) instead of O(k * n_rows * n_cols).
    dist = np.stack([np.abs(data - c).sum(axis=1) for c in centers])
    return np.argmin(dist, axis=0)


def manhattan_kmeans(data, centers, tol=E, max_iter=MAX_ITER):
    """Cluster the rows of *data* around *centers*.

    Each iteration assigns every row to its nearest center (Manhattan
    distance) and then moves each center to the mean of its members.

    Parameters
    ----------
    data : array-like, shape (n_rows, n_cols)
        Observations, one per row. Converted to ``float``; the caller's
        array is not modified.
    centers : array-like, shape (k, n_cols)
        Initial cluster centers. Copied, never modified in place.
    tol : float
        Stop when, for every cluster, replacing the center by the cluster
        mean reduces the cluster's sum of squared errors by at most *tol*.
    max_iter : int
        Maximum number of assign/update iterations (must be >= 1).

    Returns
    -------
    members : list of numpy.ndarray
        ``members[i]`` holds the row indices assigned to cluster ``i`` in
        the last assignment step. A list is used because clusters usually
        differ in size and cannot be packed into one rectangular array.
    centers : numpy.ndarray, shape (k, n_cols)
        Final cluster centers.
    converged : bool
        ``True`` if the stopping criterion was met within *max_iter*.

    Raises
    ------
    ValueError
        If *max_iter* is smaller than 1.
    """
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    # Work in float so that mean updates are not truncated for integer input.
    data = np.asarray(data, dtype=float)
    centers = np.array(centers, dtype=float)
    k = centers.shape[0]

    members = []
    converged = False
    for _ in range(max_iter):
        labels = _nearest_center(data, centers)
        members = [np.flatnonzero(labels == i) for i in range(k)]

        converged = True
        for i, idx in enumerate(members):
            if idx.size == 0:
                # An empty cluster has no mean; keep its previous center
                # instead of turning it into NaN.
                continue
            points = data[idx]
            sse_before = np.sum((points - centers[i]) ** 2)
            centers[i] = points.mean(axis=0)
            sse_after = np.sum((points - centers[i]) ** 2)
            if sse_before - sse_after > tol:
                converged = False
        if converged:
            break

    return members, centers, converged


def main():
    """Load ``test2.xlsx``, cluster its rows and print the result."""
    # Load the data.
    df2 = pd.read_excel("test2.xlsx")
    # Drop the first two columns; the rest are treated as numeric features.
    data = np.array(df2)[:, 2:].astype(float)

    if data.shape[0] <= max(INIT_ROWS):
        raise ValueError(
            "need at least %d data rows to pick the initial centers"
            % (max(INIT_ROWS) + 1)
        )

    members, centers, converged = manhattan_kmeans(data, data[list(INIT_ROWS)])
    if converged:
        print("success!")
    else:
        print("warning: no convergence after %d iterations" % MAX_ITER)

    # Print each cluster's size, member rows and center.
    for idx, center in zip(members, centers):
        print(idx.shape)
        print(idx)
        print(center)


if __name__ == "__main__":
    main()
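# Test data
# Run results:
# Note: the time and space complexity of this script can still be improved.
# This is only a starting point; comments and suggestions are welcome.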