kNN算法在简单二维数据上计算距离时:d=根号((x0-x)^2 + (y0-y)^2)。
这里被推广到1024维,将32*32二进制图片当成1*1024的向量。计算上和二维是一样的。
缺点是计算量太大了。
一、数据集
手写字图片被保存为32*32的文本文件(每行32个字符,每个字符为0或1,即二值图像):
训练文件trainingDigits有1900多个,测试文件testDigits有900多个。
点击下载
二、训练算法
#!/usr/bin/python
# -*- coding: utf-8 -*
#用KNN识别手写数字
from numpy import *
import operator
from os import listdir
# Convert a 32x32 image of 0/1 characters into a 1x1024 row vector.
def img2vector(filename):
    """Read a 32x32 digit image stored as text into a 1x1024 row vector.

    The first 32 lines of the file are read, and the first 32 characters
    of each line are converted with ``int``. Row ``i``, column ``j`` of
    the image lands at index ``32 * i + j`` of the result.

    Args:
        filename: Path of the image file to read.

    Returns:
        A NumPy array of shape (1, 1024) holding the pixel values.

    Raises:
        IndexError: If a line has fewer than 32 characters.
        ValueError: If a character cannot be converted by ``int``.
    """
    resultVect = zeros((1, 1024))
    # The context manager closes the file even when parsing fails; the
    # previous version never closed the handle.
    with open(filename) as fo:
        for i in range(32):
            lineStr = fo.readline()
            for j in range(32):
                resultVect[0, 32 * i + j] = int(lineStr[j])
    return resultVect
# inX: input vector; dataSet: training samples; labels: one label per
# training sample; k: number of nearest neighbours that vote.
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: Input vector to classify; it is tiled to one copy per
            training row before subtraction.
        dataSet: 2-D NumPy array with one training sample per row.
        labels: Sequence where ``labels[i]`` is the label of row ``i``
            of ``dataSet``.
        k: Number of nearest neighbours that cast a vote.

    Returns:
        The label that received the most votes among the k nearest rows.

    Raises:
        IndexError: If ``k`` is 0 (no votes are cast) or ``k`` exceeds
            the number of rows in ``dataSet``.
    """
    dataSetSize = dataSet.shape[0]
    # Repeat inX once per training row so the subtraction is row-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)  # sum over each row
    distances = sqDistances ** 0.5
    # Row indices ordered from the closest sample to the farthest.
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        # get() yields 0 for a label that has not been seen yet.
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # items() exists on both Python 2 and 3, unlike iteritems().
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]  # label with the highest vote count
# Handwritten digit recognition: measure the error rate on the test set.
def handwritingTest():
    """Classify every file in ``testDigits`` and print the error rate.

    Training vectors are loaded from the ``trainingDigits`` directory and
    test vectors from ``testDigits``, both relative to the current
    working directory. The true label of a file is the integer before
    the first ``_`` in its name. Each test sample is classified with
    ``classify0`` using k=4; the predicted and true labels are printed
    per sample, followed by the error count and error rate.
    """
    Labels = []
    trainFileList = listdir('trainingDigits')
    l = len(trainFileList)
    trainMat = zeros((l, 1024))  # one row per training vector
    for i in range(l):
        fileName = trainFileList[i]
        fileNameNum = int(fileName.split('_')[0])
        Labels.append(fileNameNum)
        trainMat[i, :] = img2vector('trainingDigits/%s' % fileName)
    testFileList = listdir('testDigits')
    lt = len(testFileList)
    err = 0.0
    for i in range(lt):
        fileName = testFileList[i]
        fileNameNum = int(fileName.split('_')[0])
        # Test samples live in testDigits; the previous version looked
        # them up in trainingDigits by mistake.
        VectorTest = img2vector('testDigits/%s' % fileName)
        classResult = classify0(VectorTest, trainMat, Labels, 4)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Train: %d,Ture: %d" % (classResult, fileNameNum))
        if classResult != fileNameNum:
            err += 1.0
    print("\n errnum is %d" % err)
    # An empty test directory would otherwise divide by zero.
    errRate = err / float(lt) if lt else 0.0
    print("\n errrate is %f" % errRate)
# Run the recognition test only when this file is executed as a script.
if "__main__" == __name__:
    handwritingTest()
三、算法结果
改变k的值会改变正确率,这里选取k=4