基础部分注释:
from numpy import *
import operator
from os import listdir
def createDataSet():
    """Build a tiny hard-coded sample data set for trying out the classifier.

    Returns:
        A tuple ``(group, labels)`` where ``group`` is a 4x2 NumPy array of
        sample points and ``labels`` is a list holding the class label
        ('A' or 'B') of each row of ``group``.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distance is the Euclidean distance between ``inX`` and each row of
    ``dataSet``.

    Args:
        inX: Feature vector to classify; one value per column of ``dataSet``.
        dataSet: 2-D NumPy array of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to row ``i``
            of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label with the most votes among the ``k`` nearest rows. On a tie
        the label whose first voter is closest to ``inX`` wins (stable sort
        over an insertion-ordered dict, Python 3.7+).

    Raises:
        IndexError: If ``k`` is larger than the number of rows in
            ``dataSet`` or smaller than 1.
    """
    # Number of training samples (size of the first dimension).
    dataSetSize = dataSet.shape[0]
    # Repeat inX once per training row so it can be subtracted row by row.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDiffMat = diffMat ** 2
    # axis=1 sums across each row, giving one squared distance per sample.
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances ** 0.5
    # Row indices ordered from the nearest sample to the farthest.
    sortedDistIndicies = distances.argsort()

    # Tally the labels of the k nearest samples.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Order (label, votes) pairs by votes (item 1), most votes first.
    # dict.items() replaces the Python 2-only iteritems().
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]
上述是 kNN 部分的代码,使用方式为:
group, labels = kNN.createDataSet();# build the sample data set
kNN.classify0([0,0], group, labels, 3);# classify the point [0,0] by a vote of its 3 nearest neighbours
归一化特征值:
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of 0; its normalized
    values are set to 0 instead of dividing by zero.

    Args:
        dataSet: 2-D NumPy array, one sample per row and one feature per
            column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``:
        ``normDataSet`` is the rescaled array with the same shape as
        ``dataSet``; ``ranges`` is the per-column ``max - min`` (0 for a
        constant column); ``minVals`` is the per-column minimum.
    """
    # min(0) / max(0) reduce along axis 0, i.e. one value per column.
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide by 1 where the range is 0 so constant columns yield 0, not NaN.
    # The true ranges are still what gets returned to the caller.
    safeRanges = where(ranges == 0, 1, ranges)
    m = dataSet.shape[0]
    # Repeat the per-column vectors once per row for element-wise arithmetic.
    normDataSet = dataSet - tile(minVals, (m, 1))
    normDataSet = normDataSet / tile(safeRanges, (m, 1))
    return normDataSet, ranges, minVals
使用方法:
normMat, ranges, minvals = kNN.autoNorm(datingDataMat)  # normalized matrix, per-column ranges, per-column minimums