# K-Means clustering algorithm implementation

import numpy as np

def load_data(file_name):
    """Load a tab-separated numeric text file into a matrix.

    Every non-blank line of the file is treated as one sample whose
    tab-separated fields are parsed as floats.

    Args:
        file_name: Path of the text file to read.

    Returns:
        numpy.matrix with one row per non-blank line of the file.

    Raises:
        ValueError: If a field cannot be parsed as a float.
    """
    data = []
    # The context manager closes the file even when a field fails to parse.
    with open(file_name, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) carry no sample.
                continue
            data.append([float(x) for x in line.split('\t')])
    # np.asmatrix is the function np.mat used to alias; the alias itself was
    # removed in NumPy 2.0.
    return np.asmatrix(data)

def distance(vecA, vecB):
    """Return the squared Euclidean distance between two row vectors.

    Args:
        vecA: First row vector (1 x n matrix, 1-D array or flat sequence).
        vecB: Second row vector with the same length as vecA.

    Returns:
        A 1 x 1 numpy.matrix holding the sum of squared coordinate
        differences (no square root is taken).
    """
    # Coerce to matrices so that `*` is a matrix product even when the caller
    # passes plain arrays or lists (where `*` would be element-wise).
    diff = np.asmatrix(vecA) - np.asmatrix(vecB)
    return diff * diff.T

def randCent(data, k):
    """Randomly initialise k cluster centres inside the data's bounding box.

    Args:
        data: Samples to cluster, one row per sample (2-D matrix or array).
        k: Number of cluster centres to generate (int).

    Returns:
        numpy.matrix of shape (k, n), where n is the number of columns of
        data. Coordinate j of every centre is drawn uniformly from
        [min of column j, max of column j).
    """
    points = np.asarray(data, dtype=float)
    n = points.shape[1]
    centroids = np.zeros((k, n))
    for j in range(n):
        col_min = points[:, j].min()
        col_range = points[:, j].max() - col_min
        # Offset and random spread must be one expression: when the random
        # term sits on a separate line it is a no-op statement and every
        # centre collapses onto the column minimum.
        centroids[:, j] = col_min + col_range * np.random.rand(k)
    return np.asmatrix(centroids)

def kmeans(data, k, centroids):
    """Cluster the samples with Lloyd's k-means iteration.

    Args:
        data: Samples to cluster, one row per sample (2-D matrix or array).
        k: Number of clusters (int).
        centroids: Initial cluster centres, one row per cluster. The input
            is copied and never modified.

    Returns:
        A tuple (subCenter, centroids):
        subCenter is an (m, 2) numpy.matrix, m being the number of samples;
        column 0 holds the index of the cluster each sample belongs to and
        column 1 the squared Euclidean distance to that cluster's centre.
        centroids is a numpy.matrix with the final cluster centres.
    """
    points = np.asarray(data, dtype=float)
    # np.array copies, so the caller's initial centroids stay untouched.
    centers = np.array(centroids, dtype=float)
    m = points.shape[0]

    # Column 0: cluster index, column 1: squared distance to its centre.
    # Labels start at -1 so the first pass always registers as a change.
    assignment = np.zeros((m, 2))
    assignment[:, 0] = -1

    change = True
    while change:
        # Reset once per pass: any sample switching cluster keeps iterating.
        change = False
        for i in range(m):
            # Squared distance from sample i to each of the first k centres.
            sq_dists = np.sum(np.square(centers[:k] - points[i]), axis=1)
            # argmin returns the first minimum, i.e. ties go to the lowest
            # cluster index.
            minIndex = int(np.argmin(sq_dists))
            if assignment[i, 0] != minIndex:
                change = True
            assignment[i] = minIndex, sq_dists[minIndex]

        # Assignment finished: move every centre to the mean of its members.
        for j in range(k):
            members = points[assignment[:, 0] == j]
            if len(members) == 0:
                # An empty cluster has no mean; keep its previous centre
                # instead of dividing by zero and producing NaN.
                print('r is zero')
                continue
            centers[j] = members.mean(axis=0)

    return np.asmatrix(assignment), np.asmatrix(centers)

def save_model(file_name, source):
    """Write a 2-D matrix to a text file, one row per line, tab-separated.

    Args:
        file_name: Path of the file to create or overwrite.
        source: 2-D matrix or array whose elements are written using str().
    """
    rows = np.asarray(source)
    # The context manager closes the file even if a write fails midway.
    with open(file_name, 'w', encoding="utf-8") as f:
        for row in rows:
            f.write('\t'.join(str(value) for value in row) + '\n')

if __name__ == '__main__':
    # Script entry point: cluster the samples stored in kmeans.txt into k
    # groups, then write the per-sample assignments to sub.txt and the final
    # cluster centres to center.txt.
    k = 4
    file_path = 'kmeans.txt'
    print('---------------1. 载入数据---------------')
    data = load_data(file_path)
    print('---------------2. 随机初始化聚类中心---------------')
    init_centroids = randCent(data, k)
    # Wrap in a tuple so the matrix is always formatted as a single value.
    print('The centroids: %s' % (init_centroids,))
    print('---------------3. 执行kmeans分类---------------')
    subcen, fin_centroids = kmeans(data, k, init_centroids)
    print('---------------4. 输出分类结果到文本---------------')
    save_model('sub.txt', subcen)
    save_model('center.txt', fin_centroids)

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值