一、K近邻算法(KNN)概述
KNN是通过测量不同特征值之间的距离进行分类。它的思路是:如果一个样本在特征空间中的k个最相似(即特征空间中最邻近)的样本中的大多数属于某一个类别,则该样本也属于这个类别,其中K通常是不大于20的整数。
优点:精度高、对异常值不敏感、无数据输入假定。
缺点:时间复杂度高、空间复杂度高
在利用KNN对cifar10的测试集进行predict时,如果依次(逐个循环)计算每个图像在特征空间中与其他图像的距离,运行时间会非常长。KNN本身没有真正的训练过程,耗时主要在预测阶段的距离计算上,因此我们巧妙地通过点积(矩阵乘法)运算来加快距离计算,从而加快预测速度。
二、Python实现
我们将代码部分分为以下三个方面。如果想要完整的代码内容,请直接到本文的结尾部分。
1. 加载CIFAR-10数据集。我们将每张32×32×3的图像展平为长度3072的一维向量(整个数据集因此成为二维数组),并将训练集中的前1000个样本划为验证集,用来评估不同k值下的准确率,其余49000个样本作为训练集。
import pickle
import os
import numpy as np
def load_CIFAR_batch(filename):
    """Load a single pickled CIFAR-10 batch file.

    Args:
        filename: Path to a batch file whose pickled dict holds the image
            rows under ``b'data'`` and the labels under ``b'labels'``.

    Returns:
        A tuple ``(X, Y)``: ``X`` is a float array of shape
        ``(N, 32, 32, 3)`` (channels last) and ``Y`` is an array holding
        the ``N`` labels.
    """
    # NOTE: pickle can execute arbitrary code while loading; only open batch
    # files that come from a trusted source.
    with open(filename, 'rb') as fo:
        d = pickle.load(fo, encoding='bytes')
    X = d[b'data']
    Y = d[b'labels']
    # -1 lets numpy infer the number of images, so batches of any size load
    # (the previous hard-coded 10000 rejected anything else).
    # Each row is stored channel-first (3, 32, 32); transpose to channel-last.
    X = np.asarray(X).reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
    Y = np.array(Y)
    return X, Y
def load_CIFAR10(ROOT):
    """Load the full CIFAR-10 dataset from the directory ``ROOT``.

    Reads the five training files ``data_batch_1`` .. ``data_batch_5`` and
    the ``test_batch`` file via ``load_CIFAR_batch``.

    Args:
        ROOT: Directory that contains the batch files.

    Returns:
        ``(X_train, Y_train, X_test, Y_test)`` where the training arrays are
        the five training batches concatenated along the first axis.
    """
    batches = [
        load_CIFAR_batch(os.path.join(ROOT, "data_batch_%d" % (idx, )))
        for idx in range(1, 6)
    ]
    # Stitch the per-batch arrays together along the sample axis.
    X_train = np.concatenate([images for images, _ in batches])
    Y_train = np.concatenate([labels for _, labels in batches])
    X_test, Y_test = load_CIFAR_batch(os.path.join(ROOT, "test_batch"))
    return X_train, Y_train, X_test, Y_test
X_train, Y_train, X_test, Y_test = load_CIFAR10('./cifar10/cifar-10-batches-py/')

# Flatten every 32*32*3 image into a single row vector.
Xtr_rows = X_train.reshape((X_train.shape[0], 32 * 32 * 3))  # Xtr_rows : 50000 x 3072
Xte_rows = X_test.reshape((X_test.shape[0], 32 * 32 * 3))  # Xte_rows : 10000 x 3072

# The first 1000 training samples become the validation set;
# the remaining 49000 stay as the training set.
Xval_rows, Xtr_rows = Xtr_rows[:1000], Xtr_rows[1000:]
Yval, Ytr = Y_train[:1000], Y_train[1000:]
2. KNN分类器。使用numpy的广播机制和点积运算,加快KNN的距离计算(即预测)速度。
class KNearestNeighbor(object):
    """k-nearest-neighbour classifier based on the Euclidean (L2) distance."""

    def __init__(self):
        pass

    def train(self, X, y):
        """Store the training data; no computation happens here.

        Args:
            X: 2-D array of training samples, one sample per row.
            y: 1-D numpy array of labels, ``y[i]`` being the label of
                ``X[i]``. Labels must be non-negative integers because
                ``predict_labels`` counts votes with ``np.bincount``.
        """
        self.X_train = X
        self.y_train = y

    def predict(self, X, k=1):
        """Predict a label for every row of ``X``.

        Args:
            X: 2-D array of samples with the same number of columns as the
                training data.
            k: Number of nearest neighbours that vote (must be >= 1).

        Returns:
            1-D float array with one predicted label per row of ``X``.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        dists = self.compute_distances_no_loops(X)
        return self.predict_labels(dists, k=k)

    def compute_distances_no_loops(self, X):
        """Return the L2 distance between every test and training sample.

        Uses the expansion ``(x - y)^2 = x^2 + y^2 - 2xy`` so that the whole
        computation is one matrix product plus two broadcast additions.
        NOTE(review): assumes floating-point inputs; small integer dtypes
        could overflow in ``np.square`` - confirm against callers.

        Args:
            X: 2-D array of test samples, one sample per row.

        Returns:
            Array of shape ``(num_test, num_train)`` where entry ``[i, j]``
            is the distance between ``X[i]`` and ``self.X_train[j]``.
        """
        train_sq = np.sum(np.square(self.X_train), axis=1)  # shape (num_train,)
        test_sq = np.sum(np.square(X), axis=1)[:, np.newaxis]  # shape (num_test, 1)
        sq_dists = -2 * np.dot(X, self.X_train.T) + train_sq + test_sq
        # Rounding can leave tiny negative values for (near-)identical
        # samples; clamp them so np.sqrt never produces NaN.
        return np.sqrt(np.maximum(sq_dists, 0))

    def predict_labels(self, dists, k=1):
        """Turn a distance matrix into predicted labels by majority vote.

        Args:
            dists: Array of shape ``(num_test, num_train)`` as returned by
                ``compute_distances_no_loops``.
            k: Number of nearest neighbours that vote (must be >= 1).

        Returns:
            1-D float array of length ``num_test`` with the most frequent
            label among the ``k`` nearest training samples; ties go to the
            smallest label because ``np.argmax`` returns the first maximum.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be at least 1, got %r" % (k,))
        num_test = dists.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            # Labels of the k training samples closest to test sample i.
            closest_y = self.y_train[np.argsort(dists[i])[:k]]
            y_pred[i] = np.argmax(np.bincount(closest_y))
        return y_pred
3.训练及分类
knn = KNearestNeighbor()
knn.train(Xtr_rows, Ytr)
# Report the validation accuracy for k = 3, 5, 7, 10 and 20.
for k in [3, 5, 7, 10, 20]:
    # predict() only takes (X, k); the extra num_loops=0 keyword that used to
    # be passed here raised a TypeError.
    Yte_predict = knn.predict(Xval_rows, k)
    print('k=%d' % (k), 'accuracy: %f' % (np.mean(Yte_predict == Yval)))
print("end")
上述全部代码汇总
#获取cifar10数据集
import pickle
import os
import numpy as np
def load_CIFAR_batch(filename):
    """Read one pickled CIFAR-10 batch and return its images and labels.

    Args:
        filename: Path to a batch file; the pickled dict is expected to
            carry the image rows under ``b'data'`` and the labels under
            ``b'labels'``.

    Returns:
        ``(X, Y)`` with ``X`` a float array of shape ``(N, 32, 32, 3)``
        (channels last) and ``Y`` an array of the ``N`` labels.
    """
    # NOTE: unpickling can run arbitrary code; load trusted files only.
    with open(filename, 'rb') as fo:
        d = pickle.load(fo, encoding='bytes')
    X = d[b'data']
    Y = d[b'labels']
    # Infer the image count with -1 rather than assuming exactly 10000 rows,
    # then move the channel axis last: (N, 3, 32, 32) -> (N, 32, 32, 3).
    X = np.asarray(X).reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
    Y = np.array(Y)
    return X, Y
def load_CIFAR10(ROOT):
    """Load every CIFAR-10 batch found under ``ROOT``.

    The training data comes from ``data_batch_1`` .. ``data_batch_5`` and
    the test data from ``test_batch``; each file is read with
    ``load_CIFAR_batch``.

    Args:
        ROOT: Directory that contains the batch files.

    Returns:
        ``(X_train, Y_train, X_test, Y_test)``; the training arrays are the
        five training batches joined along the first axis.
    """
    train_files = [os.path.join(ROOT, "data_batch_%d" % (n, )) for n in range(1, 6)]
    # Unzip the (images, labels) pairs into two parallel tuples.
    image_parts, label_parts = zip(*(load_CIFAR_batch(path) for path in train_files))
    X_train = np.concatenate(image_parts)
    Y_train = np.concatenate(label_parts)
    X_test, Y_test = load_CIFAR_batch(os.path.join(ROOT, "test_batch"))
    return X_train, Y_train, X_test, Y_test
# KNN分类器
class KNearestNeighbor(object):
    """k-nearest-neighbour classifier that ranks neighbours by L2 distance."""

    def __init__(self):
        pass

    def train(self, X, y):
        """Remember the training set; kNN defers all work to prediction.

        Args:
            X: 2-D array with one training sample per row.
            y: 1-D numpy array of non-negative integer labels aligned with
                the rows of ``X`` (``np.bincount`` is used for voting).
        """
        self.X_train = X
        self.y_train = y

    def predict(self, X, k=1):
        """Classify each row of ``X`` by a vote of its ``k`` nearest neighbours.

        Args:
            X: 2-D array of samples whose column count matches the training
                data.
            k: Number of neighbours taking part in the vote (>= 1).

        Returns:
            1-D float array holding one predicted label per row of ``X``.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        dists = self.compute_distances_no_loops(X)
        return self.predict_labels(dists, k=k)

    def compute_distances_no_loops(self, X):
        """Compute all test-to-train L2 distances without Python loops.

        Relies on ``(x - y)^2 = x^2 + y^2 - 2xy``: the cross term is a single
        matrix product and the squared norms are added through broadcasting.
        NOTE(review): assumes floating-point inputs; small integer dtypes
        could overflow in ``np.square`` - confirm against callers.

        Args:
            X: 2-D array of test samples, one per row.

        Returns:
            Array of shape ``(num_test, num_train)``; entry ``[i, j]`` is the
            distance between ``X[i]`` and ``self.X_train[j]``.
        """
        cross = np.dot(X, self.X_train.T)
        train_sq = np.sum(np.square(self.X_train), axis=1)
        test_sq = np.transpose([np.sum(np.square(X), axis=1)])  # column vector
        sq_dists = -2 * cross + train_sq + test_sq
        # Floating-point cancellation may yield slightly negative values for
        # (near-)duplicate samples; clamp at zero so sqrt cannot return NaN.
        np.maximum(sq_dists, 0, out=sq_dists)
        return np.sqrt(sq_dists)

    def predict_labels(self, dists, k=1):
        """Pick, for each test sample, the majority label of its k neighbours.

        Args:
            dists: Distance matrix of shape ``(num_test, num_train)``.
            k: Number of neighbours taking part in the vote (>= 1).

        Returns:
            1-D float array of length ``num_test``. When several labels tie,
            the smallest one wins because ``np.argmax`` returns the first
            maximum of the vote counts.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be at least 1, got %r" % (k,))
        num_test = dists.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            # Indices of the k smallest distances select the voting labels.
            closest_y = self.y_train[np.argsort(dists[i])[:k]]
            y_pred[i] = np.argmax(np.bincount(closest_y))
        return y_pred
# Load the CIFAR-10 dataset.
X_train, Y_train, X_test, Y_test = load_CIFAR10('./cifar10/cifar-10-batches-py/')

# Flatten each 32*32*3 image into one row.
Xtr_rows = np.reshape(X_train, (X_train.shape[0], 32 * 32 * 3))  # Xtr_rows : 50000 x 3072
Xte_rows = np.reshape(X_test, (X_test.shape[0], 32 * 32 * 3))  # Xte_rows : 10000 x 3072

# Validation set: the first 1000 training samples.
Xval_rows = Xtr_rows[:1000]
Yval = Y_train[:1000]
# Training set: the remaining 49000 samples.
Xtr_rows = Xtr_rows[1000:]
Ytr = Y_train[1000:]
# Train the classifier and evaluate it on the validation set.
knn = KNearestNeighbor()
knn.train(Xtr_rows, Ytr)
for k in [3, 5, 7, 10, 20]:
    # predict() is defined as predict(X, k=1); passing num_loops=0 as before
    # raised a TypeError, so only the supported arguments are given.
    Yte_predict = knn.predict(Xval_rows, k)
    print('k=%d' % (k), 'accuracy: %f' % (np.mean(Yte_predict == Yval)))
print("end")