Python opencv 机器学习 5.knn pca降维 ocr识别数字 mnist数据集

# -*- coding: utf-8 -*-
from numpy import *
import numpy as np
import struct
import matplotlib.pyplot as plt
import operator

# 定义一个全局特征转换变量 这个变量是在PCA中求出的
global redEigVects


def pca(dataMat, topNfeat=9999999):
    global redEigVects
    meanVals = mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals  # remove mean
    covMat = cov(meanRemoved, rowvar=0)
    eigVals, eigVects = linalg.eig(mat(covMat))
    eigValInd = argsort(eigVals)  # sort, sort goes smallest to largest
    eigValInd = eigValInd[:-(topNfeat + 1):-1]  # cut off unwanted dimensions
    redEigVects = eigVects[:, eigValInd]  # reorganize eig vects largest to smallest
    # 得到低维度数据
    lowDDataMat = meanRemoved * redEigVects  # transform data into new dimensions
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat


def KNN(inX, dataSet, labels, k):
    dataSetSize = dataSet.shape[0]
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDiffMat = diffMat ** 2
    sqDistances = sqDiffMat.sum(axis=1)  # axis=0, 表示列。axis=1, 表示行。
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


# 读取trainingMat
filename = 'train-images.idx3-ubyte'
binfile = open(filename, 'rb')
buf = binfile.read()
index = 0
# '>IIII'使用大端法读取四个unsigned int32
magic, numImages, numRows, numColumns = struct.unpack_from('>IIII', buf, index)
index += struct.calcsize('>IIII')
'''  
# 输出大端数  
print magic  
print numImages  
print numRows  
print numColumns  
'''

# 读取labels
filename1 = 'train-labels.idx1-ubyte'
binfile1 = open(filename1, 'rb')
buf1 = binfile1.read()

index1 = 0
# '>IIII'使用大端法读取两个unsigned int32
magic1, numLabels1 = struct.unpack_from('>II', buf, index)
index1 += struct.calcsize('>II')

# 设置训练数目为2500个
trainingNumbers = 60000
# 降维后的维度为7个维度 降维后的数据为40维度
DD = 50
# 初始化traingMat
trainingMatO = zeros((trainingNumbers, 28 * 28))
# 初始化标签
trainingLabels = []

# 获取经过PCA  处理过的traingMat 和 label
# for i in range(trainingNumbers):
for i in range(trainingNumbers):
    im = struct.unpack_from('>784B', buf, index)
    index += struct.calcsize('>784B')
    im = np.array(im)
    trainingMatO[i] = im
    # 读取标签
    numtemp = struct.unpack_from('1B', buf1, index1)
    label = numtemp[0]
    index1 += struct.calcsize('1B')
    trainingLabels.append(label)

# PCA
trainingMat, reconMat = pca(trainingMatO, DD)
'''
**************************************************
'''
# 读取testMat
filename3 = 't10k-images.idx3-ubyte'
binfile3 = open(filename3, 'rb')
buf3 = binfile3.read()
index3 = 0
# '>IIII'使用大端法读取四个unsigned int32
magic3, numImages3, numRows3, numColumns3 = struct.unpack_from('>IIII', buf3, index3)
index3 += struct.calcsize('>IIII')

# 读取labels
filename4 = 't10k-labels.idx1-ubyte'
binfile4 = open(filename4, 'rb')
buf4 = binfile4.read()

index4 = 0
# '>IIII'使用大端法读取两个unsigned int32
magic4, numLabels4 = struct.unpack_from('>II', buf4, index4)
index4 += struct.calcsize('>II')

'''
**************************************************
'''
# 测试数据
testNumbers = 300
# 测试维度
errCount = 0
# 获取经过PCA  处理过的testMat 和 label
for i in range(testNumbers):
    im3 = struct.unpack_from('>784B', buf3, index3)
    index3 += struct.calcsize('>784B')
    im3 = np.array(im3)

    # 新进来的数据 进行降维处理
    meanVals = mean(im3, axis=0)
    meanRemoved = im3 - meanVals  # remove mean
    # 这个时候使用的降维特征变量为上边给训练数组得出的特征量
    testingMat = meanRemoved * redEigVects

    # 读取标签
    numtemp4 = struct.unpack_from('1B', buf4, index4)
    label4 = numtemp4[0]
    index4 += struct.calcsize('1B')
    # .getA() 函数的意思是 获取该矩阵 好像PCA算法返回的是一个对象 所以此处提取了一下矩阵数组
    classifierResult = KNN(testingMat.getA(), trainingMat.getA(), trainingLabels, 10)
    print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, label4))

    if classifierResult is not label4:
        errCount = errCount + 1
print('the err rate is ', float(errCount) / testNumbers)

 

  • 0
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值