from numpy import *
import operator
def createDataSet():
    """Create the four-sample toy training set.

    Returns:
        A tuple ``(group, labels)`` where ``group`` is a 4x2 array holding
        one sample per row and ``labels`` is the list of class labels, with
        ``labels[i]`` belonging to row ``i`` of ``group``.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training samples.

    Args:
        inX: Feature vector to classify, e.g. ``[2, 3]``.
        dataSet: 2-D array of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to row ``i``
            of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the k nearest rows.

    Raises:
        IndexError: If ``k`` is not positive or exceeds the number of rows.
    """
    dataSetSize = dataSet.shape[0]  # number of training rows
    # Repeat inX once per training row so the subtraction is element-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)  # sum across each row
    distances = sqDistances ** 0.5  # Euclidean distance to every sample
    sortedDisIndicies = distances.argsort()  # row indices, nearest first
    classCount = {}
    for i in range(k):
        # Tally how often each class appears among the k nearest samples.
        voteIlabel = labels[sortedDisIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # items() exists on both Python 2 and 3; iteritems() is Python-2 only.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)  # most frequent class first
    return sortedClassCount[0][0]
# Demo: classify the point [0, 0] against the toy training set.
group, labels = createDataSet()
myclass = classify0([0, 0], group, labels, 3)
# Call form of print works on both Python 2 and Python 3.
print(myclass)
# Result: B
# -*- coding:utf-8 -*-
from numpy import *
import operator
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training samples.

    Args:
        inX: Feature vector to classify, e.g. ``[2, 3]``.
        dataSet: 2-D array of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to row ``i``
            of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the k nearest rows.

    Raises:
        IndexError: If ``k`` is not positive or exceeds the number of rows.
    """
    dataSetSize = dataSet.shape[0]  # number of training rows
    # Repeat inX once per training row so the subtraction is element-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)  # sum across each row
    distances = sqDistances ** 0.5  # Euclidean distance to every sample
    sortedDisIndicies = distances.argsort()  # row indices, nearest first
    classCount = {}
    for i in range(k):
        # Tally how often each class appears among the k nearest samples.
        voteIlabel = labels[sortedDisIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # items() exists on both Python 2 and 3; iteritems() is Python-2 only.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)  # most frequent class first
    return sortedClassCount[0][0]
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and label list.

    Every non-blank line supplies one sample: its first three tab-separated
    fields are the numeric features and its last field is the integer class
    label. Blank lines are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: an ``(n, 3)`` float array
        of features and a list of ``n`` integer labels, in file order.

    Raises:
        ValueError: If a field cannot be converted to a number or a line has
            fewer than three feature fields.
    """
    # The context manager closes the file even if parsing fails later.
    with open(filename) as fr:
        rows = [line.strip() for line in fr]
    # Drop blank lines (e.g. a trailing newline) so they do not become rows.
    rows = [row for row in rows if row]

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for i, row in enumerate(rows):
        listFromLine = row.split('\t')
        returnMat[i, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Rescale every feature column of ``dataSet`` with min-max normalisation.

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a zero range; it is mapped to 0
    instead of producing NaN from a division by zero.

    Args:
        dataSet: 2-D array with one sample per row.

    Returns:
        A tuple ``(normDataSet, DValues, minValues)``: the normalised array,
        the per-column ranges (max - min, zero for constant columns) and the
        per-column minimums.
    """
    minValues = dataSet.min(0)  # column-wise minimum
    maxValues = dataSet.max(0)  # column-wise maximum
    DValues = maxValues - minValues
    m = dataSet.shape[0]
    # Divide constant columns by 1 so they normalise to 0 rather than NaN;
    # the returned DValues still reports the true (zero) range.
    safeRanges = where(DValues == 0, 1, DValues)
    normDataSet = dataSet - tile(minValues, (m, 1))
    normDataSet = normDataSet / tile(safeRanges, (m, 1))
    return normDataSet, DValues, minValues
def datingClassTest():
    """Evaluate the kNN classifier on a hold-out split of the dating data.

    Reads ``datingTestSet2.txt``, normalises the features, uses the first
    10% of the rows as test samples and the remaining rows as training data,
    then prints each prediction next to the true label followed by the
    overall error rate. Classification uses k = 3.
    """
    hoRatio = 0.1  # fraction of the rows held out for testing
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, Dvalues, minValues = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    # The training split is the same for every test sample, so slice it once.
    trainMat = normMat[numTestVecs:, :]
    trainLabels = datingLabels[numTestVecs:]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, 3)
        # Call form of print works on both Python 2 and Python 3.
        print("the classifier comeback with:%d,the real answer is:%d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is:%f" % (errorCount / float(numTestVecs)))