1,准备数据:
从文件中读出数据(数据来自海伦约会网站的例子);数据展示如图
2 ,数据归一化
主要公式:newvalue =(oldvalue-min)/(max-min)
>>> tmpmat,rangs,mins =knn.autonorm(mat)
>>> tmpmat
array([[ 0.44832535, 0.39805139, 0.56233353],
[ 0.15873259, 0.34195467, 0.98724416],
[ 0.28542943, 0.06892523, 0.47449629],
...,
[ 0.29115949, 0.50910294, 0.51079493],
[ 0.52711097, 0.43665451, 0.4290048 ],
[ 0.47940793, 0.3768091 , 0.78571804]])
3,分类knn算法代码
def classify(testvecs, dataset, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean, measured between ``testvecs`` and every row of
    ``dataset``.

    Args:
        testvecs: Feature vector of the sample to classify.
        dataset: 2-D array of training samples, one sample per row.
        labels: Sequence of class labels, indexed like the rows of ``dataset``.
        k: Number of nearest neighbours that vote; expected to satisfy
            1 <= k <= number of rows in ``dataset``.

    Returns:
        The label that collected the most votes among the k nearest rows.
    """
    datasetsize = dataset.shape[0]
    # Repeat the test vector once per training row so the subtraction below
    # yields one difference vector per training sample.
    diffmat = tile(testvecs, (datasetsize, 1)) - dataset
    sqdiffmat = diffmat ** 2
    sqdistance = sqdiffmat.sum(axis=1)
    distance = sqdistance ** 0.5
    # Indices of the training rows, ordered from nearest to farthest.
    sorteddiss = distance.argsort()
    classcount = {}
    for i in range(k):
        votelabel = labels[sorteddiss[i]]
        classcount[votelabel] = classcount.get(votelabel, 0) + 1.0
    # items() is available on both Python 2 and Python 3 dicts, whereas
    # iteritems() exists only on Python 2.
    sortedclasscount = sorted(
        classcount.items(), key=operator.itemgetter(1), reverse=True
    )
    # The caller compares the result against a label, so hand back the label
    # of the winning (label, votes) pair rather than falling off the end.
    return sortedclasscount[0][0]
4,测试
def datingtest(int_k):
    """Measure the kNN error rate on a hold-out slice of the dating data set.

    Loads the data file, normalises it with ``autonorm``, uses the first 10%
    of the rows as test samples and the remaining rows as training data, then
    prints the fraction of test samples that ``classify`` labels incorrectly.

    Args:
        int_k: Number of neighbours passed to ``classify`` as ``k``.

    Raises:
        ValueError: If the data set is too small to yield any test sample.
    """
    horatio = 0.1  # fraction of the rows held out for testing
    # Raw string: the backslashes in the Windows path are literal characters,
    # not escape sequences.
    datingdatamat, labels = file2Mat(r'D:\python-机器学习\datingTestSet2.txt')
    normmat, _ranges, _minvals = autonorm(datingdatamat)
    m = normmat.shape[0]
    numtestvecs = int(m * horatio)
    if numtestvecs == 0:
        # Without this guard the error-rate division below would fail with
        # an unexplained ZeroDivisionError.
        raise ValueError(
            "data set has too few rows (%d) to hold out any test samples" % m
        )
    errorcount = 0.0
    for i in range(numtestvecs):
        # Rows [0, numtestvecs) are test samples; the rest is training data.
        classifyresult = classify(
            normmat[i, :],
            normmat[numtestvecs:m, :],
            labels[numtestvecs:m],
            int_k,
        )
        if classifyresult != labels[i]:
            errorcount += 1.0
    # Single-argument parenthesised form prints identically on Python 2 and 3.
    print("the toal error rate is :%f" % (errorcount / float(numtestvecs)))
>>> reload(knn)
<module 'knn' from 'D:\python-机器学习\knn.py'>
>>> knn.datingtest(4)
the toal error rate is :0.030000
分类出错的点位于不同类别数据的交集(重叠)处