KNN手写数字识别

# coding = utf-8
import numpy as np
from os import listdir


def _loadDigitDir(dirPath, h=32, w=32):
    """Load every digit file in dirPath into a feature matrix and label list.

    Each file is converted with img2vector into one row of h * w values.
    The label is the integer before the first '_' in the file name.
    File names are sorted because listdir returns them in arbitrary order.

    Returns:
        (X, Y): X is an ndarray of shape (number of files, h * w);
        Y is a list of int labels aligned with the rows of X.

    Raises:
        ValueError: if a file name does not start with an integer label.
    """
    fileList = sorted(listdir(dirPath))
    X = np.zeros((len(fileList), h * w))
    Y = []
    for i, fileName in enumerate(fileList):
        # Flatten the text image into row i of the feature matrix.
        X[i, :] = img2vector('%s/%s' % (dirPath, fileName), h, w)
        # The part of the file name before the first '_' is the class label.
        Y.append(int(fileName.split('_')[0]))
    return X, Y


def loadDataSet():
    """Load the training and test sets from the HWdigits directory.

    Reads 'HWdigits/trainSet' and 'HWdigits/testSet' (relative to the
    current working directory) and prints a progress line before each.

    Returns:
        (trainX, trainY, testX, testY): feature matrices with one
        32 * 32 = 1024-element row per file, and matching label lists.
    """
    # Load the training set.
    print("1.Loading trainSet...")
    trainX, trainY = _loadDigitDir('HWdigits/trainSet')
    # Load the test set.
    print("2.Loading testSet...")
    testX, testY = _loadDigitDir('HWdigits/testSet')
    return trainX, trainY, testX, testY


def img2vector(filename, h, w):
    """Read a text image of digit characters into a flat row vector.

    The first h lines of the file are read, and the first w characters of
    each line are converted with int(). The values are stored row-major.

    Args:
        filename: path of the text file to read.
        h: number of lines (image rows) to read.
        w: number of characters (image columns) to read from each line.

    Returns:
        ndarray of shape (1, h * w) holding the parsed values.

    Raises:
        IndexError: if the file has fewer than h lines or a line is
            shorter than w characters.
        ValueError: if one of the characters read is not a digit.
    """
    imgVector = np.zeros((1, h * w))
    # The context manager closes the file even if parsing fails.
    with open(filename) as fileIn:
        for row in range(h):
            lineStr = fileIn.readline()
            for col in range(w):
                # The row stride is the image width, not a fixed 32.
                imgVector[0, row * w + col] = int(lineStr[col])
    return imgVector


def myKNN(testDight, trainX, trainY, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distance is the Euclidean distance between testDight and each row of
    trainX. If several labels share the highest vote count, the label that
    appears first among the nearest neighbours is returned.

    Args:
        testDight: feature vector of the sample to classify.
        trainX: 2-D array with one training sample per row.
        trainY: sequence of labels aligned with the rows of trainX.
        k: number of neighbours that vote; values larger than the number
            of training samples are clamped to that number.

    Returns:
        The winning label, taken from trainY.

    Raises:
        ValueError: if trainX has no samples or k is less than 1.
    """
    trainX = np.asarray(trainX)
    numSamples = trainX.shape[0]  # one row per training image
    if numSamples == 0:
        raise ValueError("trainX must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1")
    # Never ask for more neighbours than there are training samples.
    k = min(k, numSamples)

    # Euclidean distance to every training row at once (broadcasting).
    diff = trainX - testDight
    squaredDist = np.sum(diff ** 2, axis=1)
    distance = squaredDist ** 0.5

    # Indices of the training samples ordered from nearest to farthest.
    sortedDistIndices = np.argsort(distance)

    # Count how many of the k nearest neighbours carry each label.
    classCount = {}
    for idx in sortedDistIndices[:k]:
        voteLabel = trainY[idx]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # max() keeps the first label that reaches the highest count, so ties
    # go to the label seen earliest among the nearest neighbours.
    return max(classCount, key=classCount.get)


# Driver: load the data, classify every test sample with 3-NN, then report
# the number of errors and the accuracy.
train_x, train_y, test_x, test_y = loadDataSet()
numTestSamples = test_x.shape[0]
matchCount = 0
print("3.Find the most frequent label in k-nearset...")
print("4.Show the result...")
for sample, answer in zip(test_x, test_y):
    predict = myKNN(sample, train_x, train_y, 3)
    print("result is: %d,real answer is: %d" % (predict, answer))
    if predict == answer:
        matchCount += 1
# An empty test set would otherwise divide by zero; report 0 accuracy then.
accuracy = float(matchCount) / numTestSamples if numTestSamples else 0.0
# Print the summary.
print("5.Show the accuracy...")
print(" The total number of errors is: %d" % (numTestSamples - matchCount))
print(' The classify accuracy is: % .2f%%' % (accuracy * 100))

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Chase℡

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值