最近邻算法比较简单,但样本集变大之后,需要比较的次数会非常多。因此这里改为对每一类的训练样本取均值,得到该类的模板,再将待识别样本与各类模板逐一比较,从而识别手写字。
#单模板分类器,加快分类速度
import cv2 as cv
import numpy as np
import time
def getdata(path):
    """Load an image as grayscale and resize it to 15x15 pixels.

    Args:
        path: Image file location, passed unchanged to ``cv.imread``.

    Returns:
        The array returned by ``cv.resize(img, (15, 15))``.

    Raises:
        OSError: If ``cv.imread`` could not read the file.
    """
    img = cv.imread(path, 0)  # flag 0: read the file as a grayscale image
    if img is None:
        # cv.imread reports a missing or undecodable file by returning None
        # instead of raising; fail here with the offending path rather than
        # letting cv.resize produce an unrelated error.
        raise OSError(f"cannot read image file: {path}")
    return cv.resize(img, (15, 15))  # normalise every sample to 15x15
def change(m):
    """Flatten a 2-D matrix into one row, concatenating its rows in order.

    Args:
        m: 2-D NumPy array (any number of rows and columns).

    Returns:
        A plain list containing every element of ``m`` in row-major order.
    """
    flat = []
    # Walk the rows themselves. Counting with the column count while indexing
    # rows only works for square matrices: a tall matrix would lose rows and
    # a wide one would index past the last row.
    for row in m:
        flat.extend(row)
    return flat
# K-nearest-neighbour classifier
def KNNClassfy(X, S, K, labels=None):
    """Classify every sample in X by a vote among its K nearest samples in S.

    Distances are Euclidean. One row is one sample and one column is one
    feature (pixel).

    Args:
        X: Array of samples to classify, one per row.
        S: Array of reference (training) samples, one per row.
        K: Number of nearest reference samples that take part in the vote;
            must satisfy ``1 <= K <= len(S)``.
        labels: Optional sequence with one hashable class label per row of
            ``S``. When omitted, the row index in ``S`` is used as the label,
            which fits the case of one template row per class.

    Returns:
        A list with one predicted label per row of ``X``. Ties in the vote
        are resolved in favour of the label of the closest neighbour.

    Raises:
        ValueError: If ``K`` is out of range or ``labels`` does not have one
            entry per row of ``S``.
    """
    # Work in float64: integer inputs such as uint8 images would otherwise
    # wrap around in the subtraction and in the squaring.
    queries = np.asarray(X, dtype=np.float64)
    train = np.asarray(S, dtype=np.float64)
    x = queries.shape[0]
    y = train.shape[0]
    if x == 0:
        return []
    if not 1 <= K <= y:
        raise ValueError(f"K must be between 1 and {y}, got {K}")
    if labels is None:
        labels = range(y)
    elif len(labels) != y:
        raise ValueError("labels must contain one entry per row of S")

    dist = np.empty((x, y))
    for i in range(x):
        diff = train - queries[i]
        # Sum over every axis except the first so that each reference sample
        # is reduced to a single squared distance.
        sample_axes = tuple(range(1, diff.ndim))
        dist[i] = np.sqrt(np.square(diff).sum(axis=sample_axes))

    # Stable sort: reference samples at equal distance keep their row order.
    nearest = np.argsort(dist, axis=1, kind="stable")[:, :K]

    num_id = []
    for neighbour_rows in nearest:
        # Candidate labels, ordered from the closest neighbour outwards.
        candidates = [labels[int(row)] for row in neighbour_rows]
        votes = {}
        for label in candidates:
            votes[label] = votes.get(label, 0) + 1
        most_votes = max(votes.values())
        # The first candidate reaching the top count is the closest one.
        winner = next(lab for lab in candidates if votes[lab] == most_votes)
        num_id.append(winner)
    return num_id
# Shared dataset locations: hard-coded absolute Windows directories holding
# the training images and the test images. The loading code appends a
# "\<i>_<j>.bmp" file name to these directories.
train_path = "D:\\MyWorkPlace\\pattern recognition\\images4000"
test_path = "D:\\MyWorkPlace\\pattern recognition\\images_test1000"
# Load the training set: 10 classes with 400 images each. Every class is
# reduced to a single template, the element-wise mean of its flattened images.
print("开始载入数据集!",end = " ")
my_train = []
for i in range(10):
    class_rows = []
    for j in range(400):
        class_rows.append(change(getdata(train_path+f'\\{i}_{j}.bmp')))
        loaded = i*400+j+1
        # 4000 images in total, so loaded/40 is the percentage done.
        print("\r训练集已载入:{:0.2f}%".format(loaded/40),end=" ")
    # One template row per class, stored in class order.
    my_train.append(np.array(class_rows).mean(axis=0))
S = np.array(my_train)
# Load the test set: for each of the 10 classes, the images numbered 401..500,
# each flattened into one row of X.
my_test = []
for i in range(10):
    for j in range(401,501):
        my_test.append(change(getdata(test_path+f'\\{i}_{j}.bmp')))
        percent = (i*100+j-400)*100/1000
        print("\r测试集已载入:{:0.2f}%".format(percent),end=" ")
# After the loops i and j still hold their final values, so this reports the
# overall completion figure.
print("\r数据集已载入{:0.2f}%,载入完毕!".format((i*100+j-400)*100/1000))
X = np.array(my_test)
# With K = 1 the classifier is a plain nearest-neighbour (nearest-template)
# classifier.
time_start = time.time()  # start timing
result = KNNClassfy(X,S,1)  # predicted label for every test sample

# Ground-truth labels: the test set is ordered by class, 100 samples each.
true_result = [digit for digit in range(10) for _ in range(100)]

# Count the hits and report every misclassified sample.
count = 0
for predicted, actual in zip(result, true_result):
    if predicted != actual:
        print("预测为%2s 实际为%2s"%(predicted,actual))
        continue
    count += 1
print("正确率:",count/len(result))
time_end = time.time()
print("用时{:0.2f}s".format(time_end-time_start))