import numpy as np
# read data
def loadData(filename):
    """Load a tab-separated numeric text file into a NumPy matrix.

    Each non-blank line of the file becomes one row; the tab-separated
    fields on that line are converted to float.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``numpy.matrix`` with one row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to float.
    """
    dataMat = []
    # The with-block guarantees the file is closed even if parsing fails.
    with open(filename) as fr:
        # Iterating the file object walks every line, like readlines(),
        # without first loading the whole file into a list.
        for line in fr:
            # strip() removes surrounding whitespace/newline; split('\t')
            # separates the tab-delimited fields.
            stripped = line.strip()
            if not stripped:
                # A blank (often trailing) line would make float('') raise.
                continue
            # map() returns a lazy iterator in Python 3, so wrap it in list().
            dataMat.append(list(map(float, stripped.split('\t'))))
    # The rest of the file works on matrices; np.asmatrix is used because the
    # np.mat alias is not available in NumPy 2.x.
    return np.asmatrix(dataMat)
# calculate the Euclidean distance between two vectors
def disEclud(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    Args:
        vecA: First vector (NumPy array or matrix).
        vecB: Second vector, with a shape compatible with ``vecA``.

    Returns:
        The square root of the sum of squared element-wise differences.
    """
    return np.sqrt(np.sum(np.power(vecA - vecB, 2)))
# choose k random initial centroids
def randCent(data, k):
    """Create k random centroids inside the bounding box of the data.

    For every column, each centroid coordinate is drawn uniformly from
    [column minimum, column maximum).

    Args:
        data: 2-D matrix of samples, one row per sample.
        k: Number of centroids to generate.

    Returns:
        A ``numpy.matrix`` of shape (k, number of columns in ``data``).
    """
    n = np.shape(data)[1]
    # np.zeros takes the shape as a single tuple, hence the double parentheses.
    cent = np.asmatrix(np.zeros((k, n)))
    for i in range(n):
        minD = np.min(data[:, i])
        # Width of this column's value range. The builtin float replaces the
        # np.float alias, which no longer exists in current NumPy releases.
        rangeD = float(np.max(data[:, i] - minD))
        # rand(k, 1) is uniform on [0, 1); scale and shift it into the range.
        cent[:, i] = minD + rangeD * np.random.rand(k, 1)
    return cent
## test
#data =loadData('G:/anaconda4/machinelearninginaction/Ch10/testSet.txt')
#print(data)
#result =randCent(data,3)
#print(result)
def Kmeans(data, k, distMeas=disEclud, createCent=randCent):
    """Cluster the rows of ``data`` into k groups with the k-means algorithm.

    Repeats two steps until no row changes cluster: assign every row to its
    nearest centroid, then move every centroid to the mean of its rows.

    Args:
        data: 2-D matrix of samples, one row per sample.
        k: Number of clusters.
        distMeas: Callable ``(vecA, vecB) -> distance``.
        createCent: Callable ``(data, k) -> initial centroid matrix``.

    Returns:
        A tuple ``(cent, clusterAssment)``. ``cent`` is the centroid matrix.
        ``clusterAssment`` has one row per sample: column 0 is the index of
        the assigned cluster, column 1 is the squared distance to that
        cluster's centroid.
    """
    m = np.shape(data)[0]
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Start from an index no cluster can have. Starting from 0 would make rows
    # that land in cluster 0 on the first pass look "unchanged", which can end
    # the loop before the assignments have actually settled.
    clusterAssment[:, 0] = -1
    # Honour the caller-supplied initialiser instead of a hard-coded one.
    cent = createCent(data, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # Assignment step: find the nearest centroid for every row.
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                distJ = distMeas(cent[j, :], data[i, :])
                if distJ < minDist:
                    minDist = distJ
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            # The distance is stored squared, i.e. as this row's squared error
            # with respect to its assigned centroid.
            clusterAssment[i, :] = minIndex, minDist ** 2
        print(cent)
        # Update step: move each centroid to the mean of its assigned rows.
        for ii in range(np.shape(cent)[0]):
            # .A turns the matrix column into a plain ndarray, so the
            # comparison yields a boolean array; nonzero(...)[0] then holds
            # the row numbers where that array is True.
            Clust = data[np.nonzero(clusterAssment[:, 0].A == ii)[0]]
            # The mean of zero rows is NaN, so a cluster that currently owns
            # no rows keeps its previous centroid.
            if np.shape(Clust)[0] > 0:
                cent[ii, :] = np.mean(Clust, axis=0)
    return cent, clusterAssment
# Script section, executed whenever this module runs: read the sample data
# set from a hard-coded local path into the module-level name `data`.
data =loadData('G:/anaconda4/machinelearninginaction/Ch10/testSet.txt')
# Cluster the data into 4 groups, passing disEclud and randCent explicitly.
# The returned (centroids, assignments) tuple is not captured here.
Kmeans(data,4,distMeas= disEclud,createCent = randCent)
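# In summary: for each cluster, pick out the rows of data assigned to it and
# take their mean to obtain that cluster's new centroid.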