参考博客:超详细的机器学习python入门knn干货(po主:Jack-Cui)
参考书籍:《机器学习实战》——第二章
KNN入门第二弹——手写识别系统demo
——《机器学习实战》第二章2.3 手写识别系统
这应该是机器学习里很经典的一个例子了,做法有很多,数据集也很多,研一时选修的计算机视觉的大作业也是这个。这篇博客从KNN角度进行分类。KNN详细内容见前篇
目录
1 数据集
数据集链接: https://pan.baidu.com/s/1kwBNb3o1SV_b-_Rd4_GFUQ 密码: qhff
每个数字样本是一个txt文件,内容为32行、每行32个字符的0/1二值图像
trainingDigits 训练集 总计大概2000张图的txt,每个数字大概200个样本
testDigits 测试集 大概900多张图的txt
文件名格式为 数字_样本序号.txt,下划线前的数字即该样本的类别
2 数据处理
单个文件是32x32,需要转成1x1024
def img2vector(path):
    """Convert a single 32x32 text-encoded digit image into a 1x1024 vector.

    Each non-blank line of the file is one image row; the first 32
    characters of a row are stored at positions ``row * 32 + col`` of the
    result.

    Args:
        path: Path of the text file to read.

    Returns:
        A ``numpy.ndarray`` of shape ``(1, 1024)`` holding the pixel values.

    Raises:
        IndexError: If a row is shorter than 32 characters or the file has
            more than 32 non-blank rows.
        ValueError: If a character is not a digit.
    """
    img_vec = np.zeros([1, 1024])
    # The context manager closes the file even when parsing fails.
    with open(path) as file:
        rows = [line.strip() for line in file]
    # Blank lines (typically a trailing newline at end of file) carry no
    # pixels and must not consume a row index.
    rows = [row for row in rows if row]
    for index, row in enumerate(rows):
        # Store each character of the row at its flattened position.
        for i in range(32):
            img_vec[0, index * 32 + i] = int(row[i])
    return img_vec
3 文件夹遍历
因为训练集和测试集都是一个文件夹,需要遍历文件夹来准备好训练集矩阵和测试集矩阵
遍历文件夹用到了os包的listdir()
os.listdir(path) 返回该路径下所有文件和文件夹名称组成的list(顺序不固定)
def createDataSet(filepath):
    """Read every digit file in a directory into a sample matrix and labels.

    The class of a sample is the integer before the first underscore of its
    file name.

    Args:
        filepath: Directory containing the digit text files.

    Returns:
        A tuple ``(dataSet, labels)``: ``dataSet`` is a ``numpy.ndarray`` of
        shape ``(m, 1024)`` with one row per file, and ``labels`` is a list
        of ``m`` ints in the same order.
    """
    # os.listdir returns names in arbitrary order; sorting makes the row
    # order reproducible. Entries whose prefix is not a number (for example
    # ".DS_Store") cannot be labelled, so they are skipped instead of
    # crashing int().
    filenames = [
        name for name in sorted(os.listdir(filepath))
        if name.split('_')[0].isdecimal()
    ]
    dataSet = np.zeros([len(filenames), 1024])
    labels = []
    for index, filename in enumerate(filenames):
        dataSet[index, :] = img2vector(os.path.join(filepath, filename))
        # The part of the file name before the first underscore is the class.
        labels.append(int(filename.split('_')[0]))
    return dataSet, labels
# Script entry point for this excerpt: load both data sets from their
# directories (paths are relative to the current working directory).
if __name__ == "__main__":
    trainpath = "trainingDigits"
    testpath = "testDigits"
    # Not used in this excerpt.
    k = 4
    # Load the training data and the test data.
    trainSet, train_y = createDataSet(trainpath)
    testSet, test_y = createDataSet(testpath)
4 构建分类器和测试(附完整代码)
#!/usr/bin/env python
#_*_coding:utf-8_*_
import numpy as np
import operator
import os
def img2vector(path):
    """Convert a single 32x32 text-encoded digit image into a 1x1024 vector.

    Each non-blank line of the file is one image row; the first 32
    characters of a row are stored at positions ``row * 32 + col`` of the
    result.

    Args:
        path: Path of the text file to read.

    Returns:
        A ``numpy.ndarray`` of shape ``(1, 1024)`` holding the pixel values.

    Raises:
        IndexError: If a row is shorter than 32 characters or the file has
            more than 32 non-blank rows.
        ValueError: If a character is not a digit.
    """
    img_vec = np.zeros([1, 1024])
    # The context manager closes the file even when parsing fails.
    with open(path) as file:
        rows = [line.strip() for line in file]
    # Blank lines (typically a trailing newline at end of file) carry no
    # pixels and must not consume a row index.
    rows = [row for row in rows if row]
    for index, row in enumerate(rows):
        for i in range(32):
            img_vec[0, index * 32 + i] = int(row[i])
    return img_vec
def createDataSet(filepath):
    """Read every digit file in a directory into a sample matrix and labels.

    The class of a sample is the integer before the first underscore of its
    file name.

    Args:
        filepath: Directory containing the digit text files.

    Returns:
        A tuple ``(dataSet, labels)``: ``dataSet`` is a ``numpy.ndarray`` of
        shape ``(m, 1024)`` with one row per file, and ``labels`` is a list
        of ``m`` ints in the same order.
    """
    # os.listdir returns names in arbitrary order; sorting makes the row
    # order reproducible. Entries whose prefix is not a number (for example
    # ".DS_Store") cannot be labelled, so they are skipped instead of
    # crashing int().
    filenames = [
        name for name in sorted(os.listdir(filepath))
        if name.split('_')[0].isdecimal()
    ]
    dataSet = np.zeros([len(filenames), 1024])
    labels = []
    for index, filename in enumerate(filenames):
        dataSet[index, :] = img2vector(os.path.join(filepath, filename))
        # The part of the file name before the first underscore is the class.
        labels.append(int(filename.split('_')[0]))
    return dataSet, labels
def handWritingTest(trainSet, train_y, testSet, test_y, k):
    """Classify every test sample with kNN and report the error rate.

    Prints one line per test sample (index, predicted class, true class)
    followed by the overall error rate.

    Args:
        trainSet: Training samples, one row per sample.
        train_y: Class labels of the training samples.
        testSet: Test samples as an array exposing ``shape[0]`` rows.
        test_y: True class labels of the test samples.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        The error rate in percent, as a float.

    Raises:
        ValueError: If ``testSet`` contains no samples.
    """
    test_num = testSet.shape[0]
    if test_num == 0:
        # Without samples the rate would be a division by zero.
        raise ValueError("testSet is empty; the error rate is undefined")
    countWrong = 0
    for i in range(test_num):
        classResult = classify0(testSet[i, :], trainSet, train_y, k)
        if test_y[i] != classResult:
            countWrong += 1
        print("num.%d\t分类结果:%d\t真实类别:%d" % (i, classResult, test_y[i]))
    wrongRate = countWrong / float(test_num) * 100
    print("错误率:%.2f%%" % wrongRate)
    return wrongRate
def classify0(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean. When several classes share the highest vote
    count, the class whose first vote came from the nearest neighbour wins.

    Args:
        inX: Feature vector of the sample to classify.
        dataSet: Training samples, one row per sample, with the same number
            of features as ``inX``.
        labels: Class labels, indexable by the row index of ``dataSet``.
        k: Number of neighbours to consult; values larger than the number
            of training samples are clamped to that number.

    Returns:
        The label that received the most votes.

    Raises:
        ValueError: If ``dataSet`` has no rows or ``k`` is smaller than 1.
    """
    dataSet = np.asarray(dataSet)
    sample_count = dataSet.shape[0]
    if sample_count == 0:
        raise ValueError("dataSet is empty")
    if k < 1:
        raise ValueError("k must be at least 1")
    # Never ask for more neighbours than there are training samples.
    k = min(k, sample_count)

    # Euclidean distance; broadcasting subtracts inX from every row
    # without materialising a tiled copy of it.
    diff = np.asarray(inX) - dataSet
    distance = (diff ** 2).sum(axis=1) ** 0.5

    # Row indices ordered from nearest to farthest.
    sortIndex = distance.argsort()

    # Count the votes of the k nearest neighbours, nearest first.
    votes = {}
    for neighbour in sortIndex[:k]:
        label = labels[neighbour]
        votes[label] = votes.get(label, 0) + 1

    # max() returns the first key reaching the top count, which matches a
    # stable descending sort of the vote table.
    return max(votes, key=votes.get)
# Script entry point: load both data sets from their directories (paths are
# relative to the current working directory) and run the kNN evaluation.
if __name__ == "__main__":
    trainpath = "trainingDigits"
    testpath = "testDigits"
    # Neighbour count handed to handWritingTest.
    k = 4
    # Load the training data and the test data.
    trainSet, train_y = createDataSet(trainpath)
    testSet, test_y = createDataSet(testpath)
    handWritingTest(trainSet, train_y, testSet, test_y, k)
测试结果:错误率1.48% 好像还不错