# K-means clustering, adapted from "Machine Learning in Action" (Peter Harrington).
import numpy as np


def loadDataSet(fileName):
    """Parse a tab-delimited text file into a list of data points.

    Each non-empty line becomes one point: a list of floats, one per
    tab-separated token.

    Args:
        fileName: path to a tab-delimited text file of numbers.
    Returns:
        list[list[float]] — one inner list per line.
    """
    dataMat = []
    # 'with' guarantees the file is closed even if a line fails to parse
    # (the original opened the file and never closed it).
    with open(fileName) as fr:
        for line in fr:
            curLine = line.strip().split('\t')
            dataMat.append([float(tok) for tok in curLine])
    return dataMat


def distEclud(vecA, vecB):
    """Return the Euclidean distance between two row vectors."""
    return np.sqrt(np.sum(np.power(vecA - vecB, 2)))


def randCent(dataSet, k):
    """Create k random centroids inside the data's bounding box.

    Each coordinate j is drawn uniformly from [min(col j), max(col j)),
    so every centroid lies within the observed range of the data.

    Args:
        dataSet: (m, n) matrix of data points.
        k: number of centroids to generate.
    Returns:
        (k, n) matrix of centroids.
    """
    n = np.shape(dataSet)[1]
    centroids = np.mat(np.zeros((k, n)))
    for j in range(n):
        minJ = np.min(dataSet[:, j])
        rangeJ = float(np.max(dataSet[:, j]) - minJ)
        # rand(k, 1) yields values in [0, 1); scale into the column's range.
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids


def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """Classic Lloyd's k-means: assign points, recompute centroids, repeat.

    Iterates until no point changes cluster. Prints the centroids once per
    iteration (kept from the book's original for progress visibility).

    Args:
        dataSet: (m, n) matrix of data points.
        k: number of clusters.
        distMeas: distance function taking two row vectors.
        createCent: function (dataSet, k) -> initial (k, n) centroid matrix.
    Returns:
        (centroids, clusterAssment): centroids is (k, n); clusterAssment is
        (m, 2) — column 0 is the cluster index, column 1 the squared
        distance of the point to its centroid.
    """
    m = np.shape(dataSet)[0]
    # Column 0: assigned cluster index; column 1: squared error.
    clusterAssment = np.mat(np.zeros((m, 2)))
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # Assignment step: each point joins its nearest centroid.
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        print(centroids)
        # Update step: move each centroid to the mean of its members.
        for cent in range(k):
            ptsInClust = dataSet[np.nonzero(clusterAssment[:, 0].A == cent)[0]]
            # Guard: the mean of an empty slice is NaN and would poison this
            # centroid for every later iteration (a bug in the original);
            # keep the previous centroid when the cluster is empty.
            if len(ptsInClust) > 0:
                centroids[cent, :] = np.mean(ptsInClust, axis=0)
    return centroids, clusterAssment


if __name__ == '__main__':
    # Demo run against the book's sample data file.
    datMat = np.mat(loadDataSet('testSet.txt'))
    randCent(datMat, 2)
    distance = distEclud(datMat[0], datMat[1])
    centroids, clustAssment = kMeans(datMat, 4)
K-Means — Machine Learning in Action, by Peter Harrington
Latest recommended article published 2022-02-22 18:55:10