所给的数据已经过图像处理,统一转换成 32×32 像素的黑白图像,并以文本文件保存(每个文件 32 行,每行 32 个数字字符):
数据来源于 GitHub。
代码使用 Python 3 编写。
1. kNN算法核心:
def classify0(inX, dataSet, labels, k):
    """Return the majority label among the ``k`` rows of ``dataSet`` nearest to ``inX``.

    Distance is Euclidean. When several labels share the highest vote count,
    the one whose vote was cast first (i.e. by the nearest neighbour among
    them) is returned.

    Args:
        inX: Query vector to classify.
        dataSet: 2-D array with one training sample per row.
        labels: Sequence of labels, ``labels[i]`` belonging to ``dataSet[i]``.
        k: Number of nearest neighbours that get a vote.

    Returns:
        The winning label.
    """
    rowCount = dataSet.shape[0]

    # Repeat the query once per training row so the subtraction is row-wise.
    deltas = tile(inX, (rowCount, 1)) - dataSet
    distances = ((deltas ** 2).sum(axis=1)) ** 0.5
    nearestFirst = distances.argsort()

    # Tally one vote per neighbour, walking from nearest to farthest.
    votes = {}
    for rank in range(k):
        label = labels[nearestFirst[rank]]
        if label in votes:
            votes[label] += 1
        else:
            votes[label] = 1

    # Stable descending sort: ties keep their first-voted order.
    ranking = list(votes.items())
    ranking.sort(key=operator.itemgetter(1), reverse=True)
    winner = ranking[0]
    return winner[0]
2. 读取文本,将图片数据转为向量:
def img2vector(filename):
    """Read a 32x32 text image and flatten it into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines of the file are
    converted with ``int()`` and stored row by row, so the character at
    line ``i``, column ``j`` ends up at index ``32 * i + j``.

    Args:
        filename: Path of the text file holding the image.

    Returns:
        A NumPy array of shape ``(1, 1024)``.

    Raises:
        IndexError: If one of the first 32 lines has fewer than 32 characters
            (including when the file has fewer than 32 lines).
        ValueError: If a character cannot be converted with ``int()``.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even if parsing fails; this function
    # is called once per sample, so a leaked handle would add up quickly.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
3. 手写数字识别系统的测试代码:
listdir 函数需要先从 os 模块引入(from os import listdir):
def _iterDigitSamples(dirName):
    """Yield ``(label, vector)`` for every digit sample file in ``dirName``.

    The label is the integer before the first ``_`` in the file name
    (e.g. ``3_12.txt`` -> ``3``). Entries whose name does not start with such
    an integer are skipped.
    """
    for fileNameStr in listdir(dirName):
        fileStr = fileNameStr.split('.')[0]
        try:
            classNum = int(fileStr.split('_')[0])
        except ValueError:
            # Not a sample file (e.g. macOS ".DS_Store"); skip it instead of
            # aborting the whole run.
            continue
        yield classNum, img2vector('%s/%s' % (dirName, fileNameStr))


def handwritingClassTest(trainingDir='/Users/xxx/Downloads/digits/trainingDigits',
                         testDir='/Users/xxx/Downloads/digits/testDigits',
                         k=3):
    """Evaluate the kNN digit classifier and print its error statistics.

    Every sample in ``trainingDir`` is loaded into a training matrix, then
    each sample in ``testDir`` is classified with ``classify0`` and compared
    with the label encoded in its file name. One line is printed per test
    sample, followed by the total number of errors and the error rate.

    Args:
        trainingDir: Directory holding the training sample files.
        testDir: Directory holding the test sample files.
        k: Number of nearest neighbours passed to ``classify0``.

    Raises:
        ValueError: If ``trainingDir`` contains no usable sample file.
    """
    hwLabels = []
    trainingVectors = []
    for classNum, vector in _iterDigitSamples(trainingDir):
        hwLabels.append(classNum)
        trainingVectors.append(vector)

    m = len(hwLabels)
    if m == 0:
        raise ValueError('no training samples found in %s' % trainingDir)

    # One flattened 32x32 image (1024 values) per row.
    trainingMat = zeros((m, 1024))
    for i, vector in enumerate(trainingVectors):
        trainingMat[i, :] = vector

    errorCount = 0
    mTest = 0
    for classNumStr, vectorUnderTest in _iterDigitSamples(testDir):
        mTest += 1
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        print("the classifier came back with:%d, the real answer is %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1

    print("\n the total number of errors is: %d" % errorCount)
    # Guard against an empty test directory, which would divide by zero.
    errorRate = errorCount / float(mTest) if mTest else 0.0
    print("\n the total error rate is: %f" % errorRate)
错误率为1.12%