载入数据
KNN分类
from numpy import *
import operator
import os
# KNN分类核心方法
def kNNClassify(newInput, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean. To use Manhattan (city-block) distance instead,
    replace the distance expression with ``abs(diff).sum(axis=1)``.

    Args:
        newInput: Feature vector to classify; it must broadcast against the
            rows of ``dataSet`` (e.g. length ``d`` or shape ``(1, d)``).
        dataSet: 2-D NumPy array of shape ``(numSamples, d)``, one training
            sample per row.
        labels: Sequence with one label per row of ``dataSet``. Labels must be
            hashable because they are used as dictionary keys.
        k: Number of nearest neighbours that vote. Values larger than the
            number of training samples are treated as "use every sample".

    Returns:
        The label with the most votes. When several labels tie, the one that
        was encountered first while walking the neighbours from nearest to
        farthest wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``dataSet`` has no rows.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))
    numSamples = dataSet.shape[0]  # shape[0] is the number of training rows
    if numSamples == 0:
        raise ValueError("dataSet must contain at least one sample")

    # Step 1: Euclidean distance from newInput to every training row.
    # Broadcasting stretches newInput across all rows, so no explicit tile()
    # copy of the input is needed.
    diff = newInput - dataSet
    distance = (diff ** 2).sum(axis=1) ** 0.5

    # Step 2: indices of the training rows ordered from nearest to farthest.
    sortedDistIndices = argsort(distance)

    # Steps 3-4: let the k closest rows vote. Slicing (rather than indexing
    # with range(k)) silently caps k at the number of available samples.
    classCount = {}
    for sampleIndex in sortedDistIndices[:k]:
        voteLabel = labels[sampleIndex]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # Step 5: the most frequent label wins. max() returns the first maximal
    # key in insertion order, i.e. the tied label seen nearest to newInput.
    return max(classCount, key=classCount.get)
图片转换为向量
# 将图片转换为向量
def img2vector(filename, rows=32, cols=32):
    """Read a text-encoded digit image into a single row vector.

    The file is expected to hold at least ``rows`` lines, each starting with
    at least ``cols`` digit characters. Line ``r``, character ``c`` is stored
    at flat position ``r * cols + c``.

    Args:
        filename: Path of the text file to read.
        rows: Number of lines to read (default 32).
        cols: Number of characters to read from each line (default 32).

    Returns:
        A NumPy array of shape ``(1, rows * cols)`` holding the digits as
        floats.

    Raises:
        IndexError: If a line has fewer than ``cols`` characters (including
            when the file has fewer than ``rows`` lines).
        ValueError: If a character cannot be converted with ``int()``.
    """
    imgVector = zeros((1, rows * cols))
    # The context manager guarantees the file is closed even when parsing
    # fails part-way through.
    with open(filename) as fileIn:
        for row in range(rows):
            lineStr = fileIn.readline()
            for col in range(cols):
                imgVector[0, row * cols + col] = int(lineStr[col])
    return imgVector
加载数据集
# 加载数据集
def loadDataSet(dataSetDir='D:/practice/digits/'):
    """Load the training and testing digit sets from disk.

    ``dataSetDir`` must contain two sub-directories, ``trainingDigits`` and
    ``testDigits``. Every file in them is read with ``img2vector`` and its
    label is taken from the part of the file name before the first ``_``
    (for example ``"1_18.txt"`` yields label ``1``).

    Args:
        dataSetDir: Root directory of the data set. Defaults to the location
            originally hard-coded in this script.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)`` where the ``*_x``
        values are arrays with one 1024-element row per file and the ``*_y``
        values are lists of integer labels in the same order.

    Raises:
        OSError: If a directory cannot be listed or a file cannot be opened.
        ValueError: If a file name does not start with an integer label.
    """
    # Step 1: read the training set.
    print ("---Getting training set...")
    train_x, train_y = _loadDigitDir(os.path.join(dataSetDir, 'trainingDigits'))

    # Step 2: read the testing set.
    print("---Getting testing set...")
    test_x, test_y = _loadDigitDir(os.path.join(dataSetDir, 'testDigits'))

    return train_x, train_y, test_x, test_y


def _loadDigitDir(dirPath):
    """Read every digit file in ``dirPath`` into a feature matrix and label list."""
    fileList = os.listdir(dirPath)
    # One row per file; each image is flattened to 1024 values by img2vector.
    features = zeros((len(fileList), 1024))
    labelList = []
    for i, filename in enumerate(fileList):
        features[i, :] = img2vector(os.path.join(dirPath, filename))
        # The label is the file-name prefix before '_', e.g. "1_18.txt" -> 1.
        labelList.append(int(filename.split('_')[0]))
    return features, labelList
主实现函数
def testHandWritingClass(k=7):
    """Run the kNN handwritten-digit experiment and print its accuracy.

    Loads the data with ``loadDataSet``, classifies every test sample with
    ``kNNClassify`` against the full training set, prints each
    misclassification, and finally prints the overall accuracy as a
    percentage.

    Args:
        k: Number of nearest neighbours used for voting (default 7, the value
            previously hard-coded here).
    """
    # Step 1: load the data.
    print("step 1: load data...")
    train_x, train_y, test_x, test_y = loadDataSet()

    # Step 2: "training". kNN keeps the training set as-is, so there is
    # nothing to fit here.
    print("step 2: training...")

    # Step 3: classify every test sample and count the correct predictions.
    print("step 3: testing...")
    numTestSamples = test_x.shape[0]
    if numTestSamples == 0:
        # Without this guard the accuracy computation divides by zero.
        print("No test samples found; accuracy is undefined.")
        return
    matchCount = 0
    for i in range(numTestSamples):
        predict = kNNClassify(test_x[i], train_x, train_y, k)
        if predict == test_y[i]:
            matchCount += 1
        else:
            print("The predict is:",predict," but the test is ",test_y[i])
    accuracy = float(matchCount) / numTestSamples

    # Step 4: report the result.
    print("step 4: show the result...")
    print('The classify accuracy is: %.2f%%' % (accuracy * 100))
结果比较
K=7
K=5
K=3
可见在本实验中（K 取 7、5、3），K 值越小，由 K 个最近邻类别投票得到的分类结果就越精确；但这只是该数据集上的结果，K 过小时分类更容易受噪声样本影响，实际应用中通常需要通过交叉验证来选择 K。