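# A simple k-means clustering implementation for the Iris dataset; it has been
# checked and gives good results. The data file is the one used in the
# "Machine Learning: k-NN" article.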
import csv
import random

import numpy as np
def loadData(filename):
    """Read an Iris-style CSV file into a list of numeric rows.

    Every non-empty row must hold at least five columns: four values that
    parse as floats followed by the species name.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one ``[f0, f1, f2, f3, label]`` list per data row, where
        the first four items are floats and ``label`` is 0 for
        ``Iris-setosa``, 1 for ``Iris-versicolor`` and 2 for
        ``Iris-virginica``. A species name outside that mapping yields
        ``None`` as the label.

    Raises:
        ValueError: If one of the first four columns is not a number.
        IndexError: If a non-empty row has fewer than five columns.
    """
    key_value = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
    all_data_label = []
    # Text mode: csv.reader needs str rows on Python 3; binary mode hands it
    # bytes and the first iteration fails.
    with open(filename, 'r') as csvfile:
        for row in csv.reader(csvfile):
            # Blank lines (the Iris file usually ends with one) parse to an
            # empty list; skip them instead of crashing on row[0].
            if not row:
                continue
            record = [float(value) for value in row[:4]]
            record.append(key_value.get(row[4]))
            all_data_label.append(record)
    return all_data_label
def cal_distances(train_vec, test_vec):
    """Return the Euclidean distance between two equal-length vectors.

    Args:
        train_vec: First vector (any 1-D array-like of numbers).
        test_vec: Second vector, same length as ``train_vec``.

    Returns:
        ``sqrt(sum((train_vec - test_vec) ** 2))`` as a NumPy float.
    """
    diff = np.asarray(train_vec, dtype=float) - np.asarray(test_vec, dtype=float)
    # Square each component *before* summing; squaring the sum instead lets
    # positive and negative differences cancel out.
    return np.sqrt(np.sum(diff ** 2))
def randCenter(dataSet, k):
    """Pick ``k`` random starting centroids inside the data's bounding box.

    Args:
        dataSet: 2-D array-like of shape (n_samples, n_features).
        k: Number of centroids to generate.

    Returns:
        A float array of shape (k, n_features). For every feature, each
        centroid coordinate is drawn uniformly from ``[min, max)`` of that
        feature's column.
    """
    points = np.asarray(dataSet, dtype=float)
    cols = points.shape[1]
    centroids = np.zeros((k, cols))
    # One draw of k numbers per feature, in feature order, so a seeded
    # np.random stream fills feature 0 first, then feature 1, and so on.
    for j in range(cols):
        column = points[:, j]
        low = column.min()
        centroids[:, j] = low + (column.max() - low) * np.random.random((k,))
    return centroids
def k_mearns(dataSet, k):
    """Cluster the rows of ``dataSet`` into ``k`` groups with k-means.

    Starting from centroids produced by ``randCenter``, the function
    alternates between assigning every row to its nearest centroid
    (according to ``cal_distances``) and moving each centroid to the mean of
    its assigned rows, until a full pass changes no assignment.

    Args:
        dataSet: 2-D array-like of shape (n_samples, n_features).
        k: Number of clusters.

    Returns:
        A tuple ``(centroids, clusterAssment)``. ``centroids`` has shape
        (k, n_features). ``clusterAssment`` has shape (n_samples, 2): column
        0 is the distance from the row to its centroid, column 1 is the
        centroid index stored as a float.
    """
    dataSet = np.asarray(dataSet, dtype=float)
    rows = dataSet.shape[0]
    clusterAssment = np.zeros((rows, 2))
    # Start with an impossible cluster index so that a row landing in
    # cluster 0 on the first pass still counts as a change.
    clusterAssment[:, 1] = -1
    centroids = randCenter(dataSet, k)
    cluster_change = True
    while cluster_change:
        cluster_change = False
        # Assignment step: nearest centroid for every row.
        for i in range(rows):
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                distances = cal_distances(dataSet[i, :], centroids[j, :])
                if distances < minDist:
                    minDist = distances
                    minIndex = j
            if clusterAssment[i, 1] != minIndex:
                cluster_change = True
            clusterAssment[i, :] = minDist, minIndex
        # Update step: move each centroid to the mean of its members.
        for cent in range(k):
            # Index with the boolean mask itself; wrapping it in a list makes
            # NumPy treat it as a 2-D index and raise IndexError.
            members = dataSet[clusterAssment[:, 1] == cent]
            # An empty cluster has no mean (it would become NaN); keep the
            # previous centroid in that case.
            if len(members) > 0:
                centroids[cent, :] = np.mean(members, axis=0)
    return centroids, clusterAssment
if __name__ == "__main__":
    # Load the labelled Iris rows, keep only the four feature columns (the
    # label in column 4 is not used for clustering), and split them into
    # three clusters.
    all_data_label = np.asarray(loadData('./Iris.data'))
    data = all_data_label[:, :4]
    center, cluster = k_mearns(data, 3)
    # Function-call form works on both Python 2 and Python 3; the bare
    # ``print x`` statement is a syntax error on Python 3.
    print(cluster)