#学习自机器学习实战
from numpy import *
def loadDataSet(fileName):
    """Read a tab-separated text file into a list of rows.

    Each non-blank line is stripped of surrounding whitespace and split on
    tab characters. Fields are returned as strings; no numeric conversion
    is performed here.

    Args:
        fileName: Path of the file to read.

    Returns:
        A list with one entry per non-blank line, each entry being the list
        of string fields found on that line.
    """
    dataMat = []
    # The context manager guarantees the handle is closed even if reading fails.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            # Skip blank lines (e.g. a trailing newline at end of file) so the
            # result does not contain a bogus [''] row.
            if not stripped:
                continue
            dataMat.append(stripped.split('\t'))
    return dataMat
def distEclud(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    Args:
        vecA: First vector, e.g. a 1 x n matrix row such as ``cent[j, :]``.
        vecB: Second vector with the same number of elements. Elements may
            be numbers or numeric strings; they are converted to float.

    Returns:
        The distance ``sqrt(sum((a_i - b_i) ** 2))`` as a float.
    """
    # Flatten both inputs so a 1 x n matrix row and a 1-D array line up
    # element by element; dtype=float also converts numeric strings.
    a = asarray(vecA, dtype=float).ravel()
    b = asarray(vecB, dtype=float).ravel()
    diff = a - b
    # The square root is taken once over the summed squares. Taking it per
    # component would yield the sum of absolute differences instead.
    return float(sqrt((diff * diff).sum()))
def randCent(dataSet, k):
    """Create ``k`` random centroids inside the bounding box of the data.

    Args:
        dataSet: 2-D array-like of shape (m, n). Values may be numbers or
            numeric strings.
        k: Number of centroids to generate.

    Returns:
        A k x n matrix. Column ``j`` of every centroid is drawn uniformly
        between the minimum and maximum of column ``j`` of ``dataSet``.
    """
    # Convert to float before taking min/max: on string data the comparison
    # would be lexicographic ("10" < "2") and give the wrong bounds.
    dataMat = array(dataSet, dtype=float)
    n = shape(dataMat)[1]
    # asmatrix is used instead of mat, which NumPy 2.x no longer provides.
    centriose = asmatrix(zeros((k, n)))
    # Draw column by column so the sequence of random draws is unchanged.
    for j in range(n):
        column = dataMat[:, j]
        minJ = float(column.min())
        rangeJ = float(column.max()) - minJ
        centriose[:, j] = minJ + rangeJ * random.rand(k, 1)
    return centriose
def changtofloat(dataSet):
    """Return a float matrix holding the values of ``dataSet``.

    Args:
        dataSet: 2-D array-like whose elements are numbers or numeric
            strings.

    Returns:
        A new matrix of the same shape with every element converted to
        float. The input is not modified.
    """
    # array(..., dtype=float) converts every element in one step and always
    # copies, so the result never aliases the caller's data.
    # asmatrix is used instead of mat, which NumPy 2.x no longer provides.
    return asmatrix(array(dataSet, dtype=float))
def kMeans(dataset, k, disMeans=distEclud, createCent=randCent):
    """Cluster the rows of ``dataset`` into ``k`` groups with k-means.

    Args:
        dataset: 2-D array-like of shape (m, n); every value must be
            convertible with ``float``.
        k: Number of clusters.
        disMeans: Callable ``(centroid_row, data_row) -> distance`` used to
            find the nearest centroid of each row.
        createCent: Callable ``(dataset, k) -> centroids`` producing the
            initial centroids as a 2-D, row-indexable k x n container.

    Returns:
        A tuple ``(cent, clusterAssment)``: ``cent`` holds the final
        centroids (the object returned by ``createCent``, updated in place);
        ``clusterAssment`` is an m x 2 matrix whose first column is the
        cluster index of each row and whose second column is the squared
        distance from that row to its centroid.
    """
    dataSet = array(dataset)
    m = shape(dataSet)[0]
    # asmatrix is used instead of mat, which NumPy 2.x no longer provides.
    clusterAssment = asmatrix(zeros((m, 2)))
    # Start with an index no cluster can have, so that first-pass assignments
    # to cluster 0 are registered as changes.
    clusterAssment[:, 0] = -1
    cent = createCent(dataset, k)
    # The float copy of the data does not change between iterations, so it is
    # built once instead of on every pass.
    dsett = changtofloat(dataSet)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # Assignment step: attach every row to its nearest centroid.
        for i in range(m):
            minDict = inf
            minIndex = -1
            for j in range(k):
                distJI = disMeans(cent[j, :], dataSet[i, :])
                if distJI < minDict:
                    minDict = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDict ** 2
        # Update step: move each centroid to the mean of its members.
        for cnt in range(k):
            members = nonzero(clusterAssment[:, 0].A == cnt)[0]
            # An empty cluster keeps its centroid; the mean of zero rows
            # would be NaN and poison later distance comparisons.
            if members.size == 0:
                continue
            cent[cnt, :] = mean(dsett[members], axis=0)
    return cent, clusterAssment