# coding = utf-8
import numpy as np
from os import listdir
def _loadDigitDir(dirPath):
    """Vectorize every digit file found in *dirPath*.

    The label of each sample is the integer before the first '_' in its
    file name. Files are processed in sorted name order because ``listdir``
    returns entries in arbitrary order, which would otherwise make the row
    order (and therefore KNN tie-breaking) differ between runs/platforms.

    Returns:
        (dataX, dataY): an array with one 32*32 row per file, and the list
        of integer labels in the same order.
    """
    fileList = sorted(listdir(dirPath))
    dataX = np.zeros((len(fileList), 32 * 32))
    dataY = []
    for i, fileName in enumerate(fileList):
        # Flatten the 32x32 text image into one row of the matrix
        dataX[i, :] = img2vector('%s/%s' % (dirPath, fileName), 32, 32)
        # The part of the file name before the first '_' is the label
        dataY.append(int(fileName.split('_')[0]))
    return dataX, dataY


def loadDataSet():
    """Load the training and test sets from the 'HWdigits' directory.

    Returns:
        (trainX, trainY, testX, testY): sample matrices (one row per file)
        and label lists for 'HWdigits/trainSet' and 'HWdigits/testSet'.
    """
    # Load the training set
    print("l.Loading trainSet...")
    trainX, trainY = _loadDigitDir('HWdigits/trainSet')
    # Load the test set
    print("2.Loading testSet...")
    testX, testY = _loadDigitDir('HWdigits/testSet')
    return trainX, trainY, testX, testY
def img2vector(filename, h, w):
    """Read an h-by-w grid of digit characters from a text file as a row vector.

    Args:
        filename: path of the text file; its first *h* lines must each
            start with at least *w* single-digit characters.
        h: number of lines (image rows) to read.
        w: number of characters (image columns) to read per line.

    Returns:
        A NumPy array of shape (1, h * w) holding the grid in row-major order.
    """
    imgVector = np.zeros((1, h * w))
    # The context manager guarantees the file is closed, even on error
    with open(filename) as fileIn:
        for row in range(h):
            lineStr = fileIn.readline()
            for col in range(w):
                # Row stride is the image width w (was hard-coded to 32)
                imgVector[0, row * w + col] = int(lineStr[col])
    return imgVector
def myKNN(testDight, trainX, trainY, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Args:
        testDight: feature vector of the sample to classify.
        trainX: 2-D array with one training sample per row.
        trainY: labels; trainY[i] is the label of row trainX[i].
        k: number of nearest neighbours that vote. Values larger than the
            number of training samples are clamped to that number.

    Returns:
        The label with the most votes. On a tie, the label that received
        its first vote from the nearer neighbour wins.

    Raises:
        ValueError: if k is smaller than 1 or trainX has no rows.
    """
    numSamples = trainX.shape[0]  # one row per training sample
    if k < 1 or numSamples == 0:
        raise ValueError("myKNN needs k >= 1 and at least one training sample")
    k = min(k, numSamples)  # cannot poll more neighbours than exist

    # Euclidean distance to every training row; broadcasting subtracts the
    # test vector from all rows at once instead of looping in Python.
    diff = testDight - trainX
    squaredDist = np.sum(diff ** 2, axis=1)
    distance = squaredDist ** 0.5

    # Indices of the training samples, nearest first
    sortedDistIndices = np.argsort(distance)

    # Count the votes of the k nearest neighbours per label
    classCount = {}
    for i in range(k):
        voteLabel = trainY[sortedDistIndices[i]]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # max() returns the first maximal key in insertion order, which matches
    # a strict '>' scan over the dict.
    return max(classCount, key=classCount.get)
# Script body: load the data, classify every test sample with k = 3 and
# report the number of errors and the accuracy.
train_x, train_y, test_x, test_y = loadDataSet()
numTestSamples = test_x.shape[0]
matchCount = 0
print("3.Find the most frequent label in k-nearset...")
print("4.Show the result...")
for i in range(numTestSamples):
    predict = myKNN(test_x[i], train_x, train_y, 3)
    print("result is: %d,real answer is: %d" % (predict, test_y[i]))
    if predict == test_y[i]:
        matchCount += 1
# An empty test set would otherwise divide by zero; report 0.0 instead
accuracy = float(matchCount) / numTestSamples if numTestSamples else 0.0
# Print the summary
print("5.Show the accuracy...")
print(" The total number of errors is: %d" % (numTestSamples - matchCount))
print(' The classify accuracy is: % .2f%%' % (accuracy * 100))
# KNN handwritten digit recognition
# (Web-page footer: latest recommended article published 2024-10-03 09:02:12)