1 大话k-NN
k-NN就是拿输入与模型中的数据进行比较,比较的方式可以是计算距离(如欧氏距离,即二范数),对这些距离进行升序排序,选择前k个,然后从这k个中选择标签的众数。说白了就是看输入与谁最接近,那么就和谁同类。
2 k-NN 一般流程
- 收集数据: 可以使用任何方法。
- 准备数据:为距离计算做准备,最好在计算前对特征归一化,即用数据与该列特征最小值的差,除以该列特征最大值与最小值的差(即该特征的范围):newValue = (oldValue - min) / (max - min)。
- 分析数据:比如可以可视化。
- 训练算法:此步骤不适用于k-NN,因为k-NN不需要通过训练得出模型的参数,每次分类都是将输入与所有样本比较。
- 测试算法:可以计算错误率,即错误的次数除以总共测试的次数 。
- 使用算法
3 书中例子
3.1 约会
模型的场景:A想要对约会的对象进行分类:不喜欢的、魅力一般的、极具魅力的。以此决定怎么合理安排时间约会。要求输入一些对象的特征,得出这个对象的分类 。
3.2 识别手写数字
模型场景:将所有数字处理成黑白图像32*32,像素点的值只有0和1。要求输入图像,得出该图像的类别。
3.3 思路
可以采用k-NN算法:收集带标签的样本,然后拿输入的数据和这些样本进行比较,得出该输入的分类。k-NN的本质,就是找出离输入最近的k个样本,并取其中出现次数最多(众数)的那个类别。
3.4 评价模型
可以利用错误率。可以将样本数据随机抽出一部分作为测试集,用测试集去检验模型的好坏。
3.5 code
from numpy import *
import operator
from os import listdir
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training samples.

    Distances are Euclidean (L2 norm).

    Args:
        inX: Feature vector to classify; a length-n sequence or a 1 x n
            array that broadcasts against the rows of ``dataSet``.
        dataSet: Training samples, one per row (m x n); array or nested list.
        labels: Sequence of m class labels; ``labels[i]`` belongs to row i
            of ``dataSet``.
        k: Number of nearest neighbours that vote. Values larger than m are
            clamped to m.

    Returns:
        The label with the most votes. When several labels tie, the one
        whose first vote came from the nearest neighbour wins.

    Raises:
        ValueError: If ``dataSet`` has no rows or ``k`` is less than 1.
    """
    dataSet = asarray(dataSet, dtype=float)
    inX = asarray(inX, dtype=float)
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0:
        raise ValueError("dataSet must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))
    if k > dataSetSize:
        k = dataSetSize

    # Broadcasting subtracts inX from every row; no tiled copy is needed.
    diffMat = dataSet - inX
    distances = ((diffMat ** 2).sum(axis=1)) ** 0.5
    # A stable sort keeps equidistant samples in dataSet order, so the
    # chosen neighbours are reproducible.
    sortedDistIndicies = distances.argsort(kind='mergesort')

    classCount = {}
    voteOrder = []  # labels in the order they first received a vote
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        if voteIlabel not in classCount:
            voteOrder.append(voteIlabel)
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Pick the winner explicitly: this avoids dict.iteritems() (gone in
    # Python 3) and makes tie-breaking independent of dict ordering.
    winner = voteOrder[0]
    for label in voteOrder[1:]:
        if classCount[label] > classCount[winner]:
            winner = label
    return winner
def createDataSet():
    """Return a tiny hand-made training set for smoke-testing the classifier.

    Returns:
        A tuple ``(group, labels)``: ``group`` is a 4 x 2 array of sample
        points and ``labels`` is the list of their class names, in row order.
    """
    # Each entry pairs a sample point with its class name.
    samples = [
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    ]
    points = array([coords for coords, _ in samples])
    names = [name for _, name in samples]
    return points, names
def file2matrix(filename, numFeatures=3):
    """Load a tab-separated sample file into a feature matrix and label list.

    Each non-blank line holds tab-separated fields: the first
    ``numFeatures`` fields are numeric features and the last field is an
    integer class label.

    Args:
        filename: Path of the text file to read.
        numFeatures: Number of leading fields stored as features
            (default 3).

    Returns:
        A tuple ``(returnMat, classLabelVector)``: a float array with one
        row per sample and ``numFeatures`` columns, and the list of integer
        labels in file order.

    Raises:
        ValueError: If a feature or label field cannot be parsed as a number.
    """
    # Read the file once and close it deterministically.
    with open(filename) as fr:
        rows = [line.strip() for line in fr]
    # Blank lines (e.g. a trailing newline at end of file) carry no sample.
    rows = [row for row in rows if row]

    returnMat = zeros((len(rows), numFeatures))
    classLabelVector = []
    for index, row in enumerate(rows):
        listFromLine = row.split('\t')
        returnMat[index, :] = [float(v) for v in listFromLine[0:numFeatures]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Rescale every feature column to the range [0, 1] (min-max scaling).

    Computes ``(value - column_min) / (column_max - column_min)`` for each
    column.

    Args:
        dataSet: Samples, one per row (m x n); array or nested list.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalized m x n
        float array, the per-column ``max - min``, and the per-column
        minimum. ``ranges`` is returned unmodified, so it is 0 for a
        constant column.
    """
    # Force float so integer input does not floor-divide on Python 2.
    dataSet = asarray(dataSet, dtype=float)
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column has range 0; divide by 1 instead so it normalizes
    # to 0 rather than NaN.
    safeRanges = where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column min and range to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
def datingClassTest(hoRatio=0.10, filename='datingTestSet2.txt', k=3):
    """Measure the k-NN error rate on the dating data with a hold-out split.

    The file is loaded with ``file2matrix`` and normalized with
    ``autoNorm``. The first ``hoRatio`` fraction of rows is used as the
    test set and the remaining rows as training samples for ``classify0``.
    The error rate and the raw error count are printed.

    Args:
        hoRatio: Fraction of rows held out for testing (default 0.10).
        filename: Path of the tab-separated data file.
        k: Number of neighbours passed to ``classify0`` (default 3).

    Raises:
        ValueError: If the hold-out fraction selects no test rows.
    """
    datingDataMat, datingLabels = file2matrix(filename)  # load data set from file
    normMat, _ranges, _minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if numTestVecs == 0:
        # Fail with a clear message instead of a bare ZeroDivisionError.
        raise ValueError("hold-out set is empty: %d rows with hoRatio=%r"
                         % (m, hoRatio))

    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        if classifierResult != datingLabels[i]:
            errorCount += 1.0

    # Parenthesized single-argument print works on Python 2 and Python 3.
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)
def img2vector(filename):
    """Flatten a 32 x 32 text image of '0'/'1' digits into a 1 x 1024 vector.

    Reads the first 32 lines of the file and the first 32 characters of
    each line; the character at row i, column j is stored at index
    ``32 * i + j``.

    Args:
        filename: Path of the text image.

    Returns:
        A float array of shape (1, 1024).

    Raises:
        IndexError: If a line has fewer than 32 characters or the file has
            fewer than 32 lines.
        ValueError: If a character is not a decimal digit.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even if parsing fails.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
def _digitLabel(fileNameStr):
    """Return the digit class encoded before the '_' in a name like '3_17.txt'."""
    fileStr = fileNameStr.split('.')[0]  # take off .txt
    return int(fileStr.split('_')[0])


def _listDigitFiles(dirName):
    """List the '.txt' image files in dirName, skipping stray entries."""
    # Entries such as '.DS_Store' do not encode a label and would make
    # int('') fail during label parsing.
    return [name for name in listdir(dirName) if name.endswith('.txt')]


def handwritingClassTest(trainingDir='digits/trainingDigits',
                         testDir='digits/testDigits', k=3):
    """Measure the k-NN error rate on the handwritten-digit text images.

    Every '.txt' file in ``trainingDir`` is loaded with ``img2vector`` into
    a training matrix, then every '.txt' file in ``testDir`` is classified
    with ``classify0``. The true label is the number before the '_' in the
    file name. The error count and error rate are printed.

    Args:
        trainingDir: Directory holding the training images.
        testDir: Directory holding the test images.
        k: Number of neighbours passed to ``classify0`` (default 3).

    Raises:
        ValueError: If ``testDir`` contains no '.txt' files.
    """
    # load the training set
    trainingFileList = _listDigitFiles(trainingDir)
    trainingMat = zeros((len(trainingFileList), 1024))
    hwLabels = []
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(_digitLabel(fileNameStr))
        trainingMat[i, :] = img2vector('%s/%s' % (trainingDir, fileNameStr))

    # iterate through the test set
    testFileList = _listDigitFiles(testDir)
    mTest = len(testFileList)
    if mTest == 0:
        # Fail with a clear message instead of a bare ZeroDivisionError.
        raise ValueError("no '.txt' test images found in %r" % (testDir,))

    errorCount = 0.0
    for fileNameStr in testFileList:
        classNumStr = _digitLabel(fileNameStr)
        vectorUnderTest = img2vector('%s/%s' % (testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        if classifierResult != classNumStr:
            errorCount += 1.0

    # Parenthesized single-argument print works on Python 2 and Python 3.
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))
def main():
    """Run the three demos in order: toy data set, dating data, handwriting.

    Prints a heading before each demo. The dating and handwriting demos
    read their data files from paths relative to the working directory.
    """
    # Parenthesized single-argument print produces the same output on
    # Python 2 and is valid syntax on Python 3.
    print('Test 1 createDataSet')
    group, labels = createDataSet()
    print(group)
    print(labels)
    label = classify0([0, 0], group, labels, 3)
    print(label)

    print('Test 2 engagement')
    datingClassTest()

    print('Test 3 handwriting')
    handwritingClassTest()
# Run the demos only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
3.6 运行
/Users/tl/.pyenv/versions/2.7.13ML/bin/python "/Applications/PyCharm CE.app/Contents/helpers/pydev/pydevd.py" --multiproc --qt-support --client 127.0.0.1 --port 65195 --file /Users/tl/Works/MLiA/0320/Ch02/kNN.py
pydev debugger: process 17368 is connecting
Connected to pydev debugger (build 163.15188.4)
Test 1 createDataSet
[[ 1. 1.1]
[ 1. 1. ]
[ 0. 0. ]
[ 0. 0.1]]
['A', 'A', 'B', 'B']
B
Test 2 engagement
the total error rate is: 0.050000
5.0
Test 3 handwriting
the total number of errors is: 11
the total error rate is: 0.011628
Process finished with exit code 0
参考
1.机器学习实战