KNN手写数字识别

最新推荐文章于 2024-10-03 09:02:12 发布

Chase℡

最新推荐文章于 2024-10-03 09:02:12 发布

阅读量172

点赞数 2

文章标签： python numpy 机器学习

本文链接：https://blog.csdn.net/qq_29786111/article/details/129974629

版权

# coding=utf-8
import numpy as np
from os import listdir


def _loadDigitDir(dirPath, h=32, w=32):
    """Vectorize every sample file in dirPath and parse its label from the name.

    Args:
        dirPath: Directory whose entries are all sample files named
            ``<label>_<anything>``, where ``<label>`` parses as an int.
        h: Number of rows read from each sample file.
        w: Number of characters read from each row.

    Returns:
        A tuple ``(X, Y)``: X is a ``(numFiles, h * w)`` float array with one
        sample per row, Y is the list of integer labels in the same order.
    """
    # listdir() returns entries in arbitrary order; sorting makes the sample
    # order (and therefore any printed results) reproducible across machines.
    fileNames = sorted(listdir(dirPath))
    X = np.zeros((len(fileNames), h * w))
    Y = []
    for i, fileName in enumerate(fileNames):
        X[i, :] = img2vector('%s/%s' % (dirPath, fileName), h, w)
        # The text before the first underscore of the file name is the label.
        Y.append(int(fileName.split('_')[0]))
    return X, Y


def loadDataSet():
    """Load the training and test sets from the HWdigits directory.

    Reads every file under ``HWdigits/trainSet`` and ``HWdigits/testSet``
    (relative to the current working directory) as a 32x32 digit grid.

    Returns:
        A tuple ``(trainX, trainY, testX, testY)``. The X values are
        ``(numFiles, 1024)`` arrays with one sample per row; the Y values are
        lists of integer labels taken from the file names.
    """
    # Load the training set.
    # NOTE(review): "l." in the message below looks like a typo for "1." --
    # left unchanged so the program output stays identical.
    print("l.Loading trainSet...")
    trainX, trainY = _loadDigitDir('HWdigits/trainSet')
    # Load the test set.
    print("2.Loading testSet...")
    testX, testY = _loadDigitDir('HWdigits/testSet')
    return trainX, trainY, testX, testY


def img2vector(filename, h, w):
    """Read an h-by-w grid of digit characters and flatten it to a row vector.

    Args:
        filename: Path of a text file holding at least ``h`` lines, each with
            at least ``w`` leading characters that parse as integers.
        h: Number of lines to read.
        w: Number of characters to read from each line.

    Returns:
        A ``(1, h * w)`` float array; the character at line ``row`` and
        column ``col`` is stored at index ``row * w + col``.
    """
    imgVector = np.zeros((1, h * w))
    # The context manager closes the file even if parsing raises.
    with open(filename) as fileIn:
        for row in range(h):
            lineStr = fileIn.readline()
            for col in range(w):
                # The row stride is w (not a fixed 32) so that grids of any
                # width land in the right cells.
                imgVector[0, row * w + col] = int(lineStr[col])
    return imgVector


def myKNN(testDight, trainX, trainY, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Args:
        testDight: Feature vector of the sample to classify. It is flattened
            before use, so both ``(d,)`` and ``(1, d)`` layouts are accepted.
        trainX: 2-D array-like with one training sample per row.
        trainY: Sequence of labels; ``trainY[i]`` is the label of row ``i``
            of trainX.
        k: Number of nearest neighbours that vote. Values larger than the
            number of training samples are clamped to that number.

    Returns:
        The label with the most votes among the k nearest training samples.
        On a tie, the tied label met first when walking the neighbours from
        nearest to farthest wins.

    Raises:
        ValueError: If trainX has no rows or k is smaller than 1.
    """
    trainX = np.asarray(trainX)
    numSamples = trainX.shape[0]  # one sample per row
    if numSamples == 0:
        raise ValueError("trainX must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))
    k = min(k, numSamples)

    # Euclidean distance from the test sample to every training row at once:
    # broadcasting subtracts the flattened sample from each row of trainX.
    diff = np.asarray(testDight).ravel() - trainX
    squaredDist = np.sum(diff ** 2, axis=1)
    distance = squaredDist ** 0.5

    # Indices of the training samples, nearest first.
    sortedDistIndices = np.argsort(distance)

    # Count the votes of the k nearest neighbours per label.
    classCount = {}
    for idx in sortedDistIndices[:k]:
        voteLabel = trainY[idx]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # max() returns the first maximal key in insertion order, which keeps the
    # nearest-first tie-breaking described in the docstring.
    return max(classCount, key=classCount.get)


# Evaluation script: load both data sets, classify every test sample with
# 3-nearest-neighbour voting, and report the error count and accuracy.
train_x, train_y, test_x, test_y = loadDataSet()
numTestSamples = test_x.shape[0]
matchCount = 0
print("3.Find the most frequent label in k-nearset...")
print("4.Show the result...")
for sample, answer in zip(test_x, test_y):
    predict = myKNN(sample, train_x, train_y, 3)
    print("result is: %d,real answer is: %d" % (predict, answer))
    if predict == answer:
        matchCount += 1
# An empty test set would otherwise raise ZeroDivisionError; report 0.0.
accuracy = float(matchCount) / numTestSamples if numTestSamples else 0.0
# Print the summary.
print("5.Show the accuracy...")
print(" The total number of errors is: %d" % (numTestSamples - matchCount))
print(' The classify accuracy is: % .2f%%' % (accuracy * 100))