程序清单2-6 手写数字识别系统的测试代码
本笔记将主要注意力放在理解代码上,所以大家看代码中的注释即可
核心代码(节选,完整代码见下文)
def img2vector(filename):
    """Flatten a 32x32 text image of digits into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines of ``filename``
    are converted with ``int()`` and stored row after row.

    Args:
        filename: Path of the text file to read.

    Returns:
        A numpy array of shape (1, 1024); element ``32*i + j`` holds the
        digit found at line ``i``, column ``j``.

    Raises:
        IndexError: If the file has fewer than 32 lines, or a line ends
            before column 32 without a trailing newline.
        ValueError: If one of the characters read is not an integer literal.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even if parsing fails, so the
    # handle is never leaked.
    with open(filename) as fr:
        for i in range(32):
            # Read exactly one line per iteration.
            lineStr = fr.readline()
            for j in range(32):
                # Copy row i of the image into its slot of the flat vector.
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
def handwritingClassTest():
    """Evaluate the kNN digit classifier on the files under ``testDigits``.

    Every file in ``trainingDigits`` becomes one row of the training matrix;
    its label is the integer in front of the underscore in the file name
    (e.g. ``0_17.txt`` -> 0). Each file in ``testDigits`` is then classified
    with ``classify0`` (k=3) and compared with the label taken from its own
    name. The per-sample result, the number of errors and the error rate are
    printed.
    """
    # Load the training set: one flattened image per matrix row.
    trainNames = listdir('trainingDigits')
    trainingMat = zeros((len(trainNames), 1024))
    hwLabels = []
    for row, name in enumerate(trainNames):
        # '0_17.txt' -> '0_17' -> 0
        stem = name.split('.')[0]
        hwLabels.append(int(stem.split('_')[0]))
        trainingMat[row, :] = img2vector('trainingDigits/%s' % name)

    # Run every test image through the classifier.
    testNames = listdir('testDigits')
    mTest = len(testNames)
    errorCount = 0.0
    for name in testNames:
        # Same file-name parsing as for the training data.
        expected = int(name.split('.')[0].split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % name)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        # Compare the prediction with the label encoded in the file name.
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, expected))
        if classifierResult != expected:
            errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))
完整代码
#批量注释、批量取消注释 Ctrl+/
# from __future__ import print_function
from numpy import *
from os import listdir
import operator#运算符模块
import matplotlib.pyplot as plt
def createDataSet():
    """Return the four-point toy data set together with its class labels.

    Returns:
        A tuple ``(group, labels)``: ``group`` is a 4x2 numpy array of sample
        coordinates and ``labels`` is a list holding the class name of each
        row.
    """
    samples = [
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    ]
    group = array([point for point, _ in samples])
    labels = [tag for _, tag in samples]
    return group, labels
# Build the toy samples and their labels once at module level (runs on import).
group,labels=createDataSet()
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training samples.

    Args:
        inX: Feature vector to classify.
        dataSet: Training samples, one per row (numpy array).
        labels: Class label of each row of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote. Values
            larger than the number of training samples use all samples.

    Returns:
        The label with the most votes among the k nearest samples. On a tie
        the label that was encountered first (nearest neighbour first) wins.

    Raises:
        ValueError: If no neighbour can vote (empty ``dataSet`` or ``k`` < 1).
    """
    dataSetSize = dataSet.shape[0]  # number of training samples (rows)
    # Repeat inX once per training row so the subtraction is element-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    # Euclidean distance: sum the squared differences of every row (axis=1),
    # then take the square root.
    distances = ((diffMat ** 2).sum(axis=1)) ** 0.5
    # Row indices ordered from the nearest to the farthest sample.
    sortedDistIndicies = distances.argsort()
    # Vote counter: label -> number of neighbours carrying that label.
    classCount = {}
    # Never look at more neighbours than there are training samples.
    for i in range(min(k, dataSetSize)):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # Return the label with the highest count. The previous code indexed the
    # key list with its length minus one, which always yielded the label
    # inserted last instead of the most frequent one.
    return max(classCount, key=classCount.get)
# group,labels = createDataSet()
# a=classify0([0,0], group,labels,3)
# print(a)
#自己根据Python3 改正后的函数
# def file2matrix(filename): # 将数据分离为样本数据与标签
# #open a file, default: 'r'ead
# fr = open(filename)
# #一次读取所有行
# arrayOLines = fr.readlines()
# #得到行数
# numberOfLines = len(arrayOLines)
# #1000*3 zeros matrix,row-1000, column-3
# returnMat = zeros((numberOfLines,3))
# #声明
# classLabelVector = []
# classLabelVector_Value = []
# index = 0
# #逐行扫描
# for line in arrayOLines:
# #strip函数会删除头和尾的字符,中间的不会删除
# line = line.strip()
# #删除‘\t’字符,仅剩下数据,供使用
# listFromLine = line.split('\t')
# #得到前三列数据,即飞行时间,游戏,冰激凌
# returnMat[index, :] = listFromLine[0:3]
# #得到largeDoses,smallDoses,didntLike的label
# classLabelVector.append(listFromLine[-1]) #无法将largeDoses,smallDoses,didntLike
# #转换为int。基于这个思想,我们在这里将得到的行矩阵建立
# #一个数值矩阵与之对应,暂时这样处理,不合适再继续修改
# if classLabelVector[index] == 'largeDoses':
# classLabelVector_Value.append(3)
# elif classLabelVector[index] == 'smallDoses':
# classLabelVector_Value.append(2)
# else:
# classLabelVector_Value.append(1)
# index += 1
# return returnMat, classLabelVector_Value
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    Each line must hold at least three numeric feature columns followed by a
    label in the last column. The label may be an integer literal or one of
    the text labels ``largeDoses`` / ``smallDoses`` / ``didntLike``, which
    are mapped to 3 / 2 / 1.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: an ``(n, 3)`` numpy array
        with the first three columns of every line and a list of ``n``
        integer labels.

    Raises:
        ValueError: If a feature is not numeric or a label is neither an
            integer nor one of the known text labels.
    """
    # Numeric codes used for files whose last column is a text label.
    namedLabels = {'largeDoses': 3, 'smallDoses': 2, 'didntLike': 1}
    # Read the file once and close it right away; the line count needed to
    # size the matrix comes from the same list.
    with open(filename) as fr:
        arrayOLines = fr.readlines()
    returnMat = zeros((len(arrayOLines), 3))
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        listFromLine = line.strip().split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        rawLabel = listFromLine[-1]
        try:
            classLabelVector.append(int(rawLabel))
        except ValueError:
            if rawLabel not in namedLabels:
                raise
            classLabelVector.append(namedLabels[rawLabel])
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of 0; such a column is
    mapped to 0 instead of being divided by zero.

    Args:
        dataSet: 2-D numpy array with one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalised array, the
        per-column ``max - min`` and the per-column minimum.
    """
    # Column-wise minimum and maximum.
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    m = dataSet.shape[0]  # number of rows
    # Divide constant columns by 1 so they normalise to 0 rather than NaN;
    # the returned ``ranges`` still reports the true (zero) range.
    safeRanges = where(ranges == 0, 1, ranges)
    # Repeat the per-column vectors m times so the arithmetic is element-wise.
    normDataSet = dataSet - tile(minVals, (m, 1))
    normDataSet = normDataSet / tile(safeRanges, (m, 1))
    return normDataSet, ranges, minVals
def datingClassTest():
    """Hold out the first 10% of the dating data and print the kNN error rate.

    The data is loaded with ``file2matrix`` and normalised with ``autoNorm``.
    The first ``int(m * 0.10)`` rows are classified with ``classify0`` (k=3)
    against the remaining rows; every prediction, the error rate and the
    error count are printed.
    """
    hoRatio = 0.10  # fraction of the samples held out for testing
    # NOTE: datingTestSet2.txt stores the labels as 3/2/1 instead of strings;
    # which of the two files can be loaded depends on how file2matrix()
    # parses the label column.
    datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
    normMat, _ranges, _minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]  # total number of samples
    numTestVecs = int(m * hoRatio)  # number of held-out samples
    # The first numTestVecs rows are test data, the rest is the training set.
    trainingRows = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]
    errorCount = 0.0
    for sample, expected in zip(normMat[:numTestVecs], datingLabels[:numTestVecs]):
        classifierResult = classify0(sample, trainingRows, trainingLabels, 3)
        # Show the prediction next to the true label.
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, expected))
        if classifierResult != expected:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)
def img2vector(filename):
    """Flatten a 32x32 text image of digits into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines of ``filename``
    are converted with ``int()`` and stored row after row.

    Args:
        filename: Path of the text file to read.

    Returns:
        A numpy array of shape (1, 1024); element ``32*i + j`` holds the
        digit found at line ``i``, column ``j``.

    Raises:
        IndexError: If the file has fewer than 32 lines, or a line ends
            before column 32 without a trailing newline.
        ValueError: If one of the characters read is not an integer literal.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even if parsing fails, so the
    # handle is never leaked.
    with open(filename) as fr:
        for i in range(32):
            # Read exactly one line per iteration.
            lineStr = fr.readline()
            for j in range(32):
                # Copy row i of the image into its slot of the flat vector.
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
def handwritingClassTest():
    """Evaluate the kNN digit classifier on the files under ``testDigits``.

    Every file in ``trainingDigits`` becomes one row of the training matrix;
    its label is the integer in front of the underscore in the file name
    (e.g. ``0_17.txt`` -> 0). Each file in ``testDigits`` is then classified
    with ``classify0`` (k=3) and compared with the label taken from its own
    name. The per-sample result, the number of errors and the error rate are
    printed.
    """
    # Load the training set: one flattened image per matrix row.
    trainNames = listdir('trainingDigits')
    trainingMat = zeros((len(trainNames), 1024))
    hwLabels = []
    for row, name in enumerate(trainNames):
        # '0_17.txt' -> '0_17' -> 0
        stem = name.split('.')[0]
        hwLabels.append(int(stem.split('_')[0]))
        trainingMat[row, :] = img2vector('trainingDigits/%s' % name)

    # Run every test image through the classifier.
    testNames = listdir('testDigits')
    mTest = len(testNames)
    errorCount = 0.0
    for name in testNames:
        # Same file-name parsing as for the training data.
        expected = int(name.split('.')[0].split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % name)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        # Compare the prediction with the label encoded in the file name.
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, expected))
        if classifierResult != expected:
            errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))
测试
# Run the handwritten-digit test when this script is executed.
handwritingClassTest()
完成!