KNN分类CiFAR-10数据集Python实现

一、K近邻算法(KNN)概述

KNN是通过测量不同特征值之间的距离进行分类。它的思路是:如果一个样本在特征空间中的k个最相似(即特征空间中最邻近)的样本中的大多数属于某一个类别,则该样本也属于这个类别,其中K通常是不大于20的整数。

优点:精度高、对异常值不敏感、无数据输入假定。

缺点:时间复杂度高、空间复杂度高

在利用KNN对cifar10的测试集进行predict时,依次计算每个图像在坐标系上与其他图像的距离,运行时间非常长。我们巧妙地通过点积运算,加快预测速度。

二、Python实现

我们将代码部分分为以下三个方面。如果想要完整的代码内容,请直接到本文的结尾部分。

1. 加载CIFAR-10数据集。我们将数据集展成二维数据,并将训练集中的1000个数据作为验证集,用来验证其准确率。

import pickle
import os
import numpy as np
def load_CIFAR_batch(filename):
    with open(filename, 'rb') as fo:
        d= pickle.load(fo, encoding='bytes')
        X=d[b'data']
        Y=d[b'labels']

        X=X.reshape(10000, 3, 32, 32).transpose(0,2,3,1).astype("float")
        Y=np.array(Y)
        return X, Y

def load_CIFAR10(ROOT):
    """Load the five CIFAR-10 training batches plus the test batch.

    Parameters
    ----------
    ROOT : str
        Directory containing the extracted ``cifar-10-batches-py`` files.

    Returns
    -------
    (X_train, Y_train, X_test, Y_test) : tuple of np.ndarray
        Training images/labels stacked from batches 1-5, then test data.
    """
    # Read every training batch, then stack them into single arrays.
    batches = [
        load_CIFAR_batch(os.path.join(ROOT, "data_batch_%d" % (b,)))
        for b in range(1, 6)
    ]
    X_train = np.concatenate([images for images, _ in batches])
    Y_train = np.concatenate([labels for _, labels in batches])

    X_test, Y_test = load_CIFAR_batch(os.path.join(ROOT, "test_batch"))
    return X_train, Y_train, X_test, Y_test


# Load the CIFAR-10 dataset from disk (site-specific path).
X_train, Y_train, X_test, Y_test = load_CIFAR10('./cifar10/cifar-10-batches-py/')

# Flatten each 32x32x3 image into a single 3072-dimensional row vector.
Xtr_rows = X_train.reshape(X_train.shape[0], -1)  # (50000, 3072)
Xte_rows = X_test.reshape(X_test.shape[0], -1)    # (10000, 3072)

# Hold out the first 1000 training rows as a validation split ...
Xval_rows, Yval = Xtr_rows[:1000], Y_train[:1000]
# ... and keep the remaining 49000 rows for training.
Xtr_rows, Ytr = Xtr_rows[1000:], Y_train[1000:]

2. KNN分类器。使用numpy的广播机制和点积运算,加快KNN预测速度。

class KNearestNeighbor(object):
    """k-nearest-neighbor classifier using the L2 (Euclidean) distance.

    Training simply memorizes the data; prediction compares each test
    row against every stored training row via one vectorized matrix
    product (no explicit Python loops over the training set).
    """

    def __init__(self):
        pass

    def train(self, X, y):
        """Memorize the training data.

        X : (num_train, D) array of flattened images.
        y : (num_train,) array of non-negative integer labels.
        """
        self.X_train = X
        self.y_train = y

    def predict(self, X, k=1, num_loops=0):
        """Predict a label for each row of X from its k nearest neighbors.

        ``num_loops`` is accepted for API compatibility with callers that
        pass it (only the fully vectorized num_loops=0 path exists here).
        """
        if num_loops != 0:
            raise ValueError("only num_loops=0 (vectorized) is supported")
        dists = self.compute_distances_no_loops(X)
        return self.predict_labels(dists, k=k)

    def compute_distances_no_loops(self, X):
        """Return the (num_test, num_train) matrix of L2 distances.

        Uses the expansion (x-y)^2 = x^2 + y^2 - 2xy so the whole matrix
        comes from a single dot product plus numpy broadcasting.
        """
        test_sq = np.sum(np.square(X), axis=1)[:, np.newaxis]
        train_sq = np.sum(np.square(self.X_train), axis=1)
        sq_dists = test_sq + train_sq - 2.0 * np.dot(X, self.X_train.T)
        # BUG FIX: floating-point cancellation can leave tiny negative
        # values; clamp to 0 so sqrt never produces NaN.
        return np.sqrt(np.maximum(sq_dists, 0.0))

    def predict_labels(self, dists, k=1):
        """Majority-vote the labels of the k closest training rows.

        Ties are broken toward the smallest label (argmax of bincount).
        """
        num_test = dists.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            closest_y = self.y_train[np.argsort(dists[i])[:k]]
            y_pred[i] = np.argmax(np.bincount(closest_y))
        return y_pred

3.训练及分类

# Memorize the 49000-row training split, then measure validation
# accuracy for several values of k.
knn = KNearestNeighbor()
knn.train(Xtr_rows, Ytr)
for k in [3, 5, 7, 10, 20]:
    # BUG FIX: the original passed num_loops=0, which predict() does not
    # accept and which raised a TypeError at runtime.
    Yte_predict = knn.predict(Xval_rows, k)  # predict on the validation split
    print ('k=%d'%(k),'accuracy: %f' % ( np.mean(Yte_predict == Yval) ))
print("end")

上述全部代码汇总

#获取cifar10数据集
import pickle
import os
import numpy as np
def load_CIFAR_batch(filename):
    with open(filename, 'rb') as fo:
        d= pickle.load(fo, encoding='bytes')
        X=d[b'data']
        Y=d[b'labels']

        X=X.reshape(10000, 3, 32, 32).transpose(0,2,3,1).astype("float")
        Y=np.array(Y)

        return X, Y


def load_CIFAR10(ROOT):
    """Load the five CIFAR-10 training batches plus the test batch.

    Parameters
    ----------
    ROOT : str
        Directory containing the extracted ``cifar-10-batches-py`` files.

    Returns
    -------
    (X_train, Y_train, X_test, Y_test) : tuple of np.ndarray
        Training images/labels stacked from batches 1-5, then test data.
    """
    # Read every training batch, then stack them into single arrays.
    batches = [
        load_CIFAR_batch(os.path.join(ROOT, "data_batch_%d" % (b,)))
        for b in range(1, 6)
    ]
    X_train = np.concatenate([images for images, _ in batches])
    Y_train = np.concatenate([labels for _, labels in batches])

    X_test, Y_test = load_CIFAR_batch(os.path.join(ROOT, "test_batch"))

    return X_train, Y_train, X_test, Y_test

# KNN分类器
class KNearestNeighbor(object):
    """k-nearest-neighbor classifier using the L2 (Euclidean) distance.

    Training simply memorizes the data; prediction compares each test
    row against every stored training row via one vectorized matrix
    product (no explicit Python loops over the training set).
    """

    def __init__(self):
        pass

    def train(self, X, y):
        """Memorize the training data.

        X : (num_train, D) array of flattened images.
        y : (num_train,) array of non-negative integer labels.
        """
        self.X_train = X
        self.y_train = y

    def predict(self, X, k=1, num_loops=0):
        """Predict a label for each row of X from its k nearest neighbors.

        ``num_loops`` is accepted for API compatibility with callers that
        pass it (only the fully vectorized num_loops=0 path exists here).
        """
        if num_loops != 0:
            raise ValueError("only num_loops=0 (vectorized) is supported")
        dists = self.compute_distances_no_loops(X)
        return self.predict_labels(dists, k=k)

    def compute_distances_no_loops(self, X):
        """Return the (num_test, num_train) matrix of L2 distances.

        Uses the expansion (x-y)^2 = x^2 + y^2 - 2xy so the whole matrix
        comes from a single dot product plus numpy broadcasting.
        """
        test_sq = np.sum(np.square(X), axis=1)[:, np.newaxis]
        train_sq = np.sum(np.square(self.X_train), axis=1)
        sq_dists = test_sq + train_sq - 2.0 * np.dot(X, self.X_train.T)
        # BUG FIX: floating-point cancellation can leave tiny negative
        # values; clamp to 0 so sqrt never produces NaN.
        return np.sqrt(np.maximum(sq_dists, 0.0))

    def predict_labels(self, dists, k=1):
        """Majority-vote the labels of the k closest training rows.

        Ties are broken toward the smallest label (argmax of bincount).
        """
        num_test = dists.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            closest_y = self.y_train[np.argsort(dists[i])[:k]]
            y_pred[i] = np.argmax(np.bincount(closest_y))
        return y_pred

# Load the CIFAR-10 dataset from disk (site-specific path).
X_train, Y_train, X_test, Y_test = load_CIFAR10('./cifar10/cifar-10-batches-py/')

# Flatten each 32x32x3 image into a single 3072-dimensional row vector.
Xtr_rows = X_train.reshape(X_train.shape[0], -1)  # (50000, 3072)
Xte_rows = X_test.reshape(X_test.shape[0], -1)    # (10000, 3072)

# Hold out the first 1000 training rows as a validation split,
# keeping the remaining 49000 rows for training.
Xval_rows, Yval = Xtr_rows[:1000], Y_train[:1000]
Xtr_rows, Ytr = Xtr_rows[1000:], Y_train[1000:]

# Train (memorize) the classifier, then measure validation accuracy
# for several values of k.
knn = KNearestNeighbor()
knn.train(Xtr_rows, Ytr)
for k in [3, 5, 7, 10, 20]:
    # BUG FIX: the original passed num_loops=0, which predict() does not
    # accept and which raised a TypeError at runtime.
    Yte_predict = knn.predict(Xval_rows, k)  # predict on the validation split
    print ('k=%d'%(k),'accuracy: %f' % ( np.mean(Yte_predict == Yval) ))
print("end")

好的,下面是基于KNN算法对CIFAR-100数据集分类的Python程序。

首先,需要安装numpy库(注意:pickle是Python 3的标准库,无需安装;cPickle只存在于Python 2,在Python 3中其功能已并入pickle):

```
pip install numpy
```

接下来,我们需要加载CIFAR-100数据集。注意与CIFAR-10不同,CIFAR-100的训练数据保存在单个`train`文件中,而不是5个batch文件:

```python
import numpy as np
import pickle

def unpickle(file):
    with open(file, 'rb') as fo:
        data_dict = pickle.load(fo, encoding='bytes')
    return data_dict

def load_cifar100():
    data_dict = unpickle('cifar-100-python/train')
    train_data = data_dict[b'data']
    train_labels = np.array(data_dict[b'fine_labels'])

    data_dict = unpickle('cifar-100-python/test')
    test_data = data_dict[b'data']
    test_labels = np.array(data_dict[b'fine_labels'])

    return train_data, train_labels, test_data, test_labels
```

接下来,我们需要对数据进行预处理,将像素值归一化到0到1之间,并将数据向量化。代码如下:

```python
def preprocess_data(train_data, test_data):
    train_data = train_data.astype('float32') / 255
    test_data = test_data.astype('float32') / 255
    train_data = train_data.reshape(train_data.shape[0], -1)
    test_data = test_data.reshape(test_data.shape[0], -1)
    return train_data, test_data
```

然后,我们需要定义KNN分类器。首先,我们需要计算测试数据与训练数据之间的距离,然后选择K个最近的数据点,并将它们的标签进行统计。最后,我们选择出现最频繁的标签作为预测结果。代码如下:

```python
from collections import Counter

class KNNClassifier:
    def __init__(self, k):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        num_test = X.shape[0]
        y_pred = np.zeros(num_test, dtype=self.y_train.dtype)
        for i in range(num_test):
            distances = np.sum(np.abs(self.X_train - X[i, :]), axis=1)
            nearest_labels = self.y_train[np.argsort(distances)[:self.k]]
            y_pred[i] = Counter(nearest_labels).most_common(1)[0][0]
        return y_pred
```

最后,我们可以使用以下代码进行训练和测试:

```python
train_data, train_labels, test_data, test_labels = load_cifar100()
train_data, test_data = preprocess_data(train_data, test_data)
knn = KNNClassifier(k=5)
knn.fit(train_data, train_labels)
y_pred = knn.predict(test_data)
accuracy = np.mean(y_pred == test_labels)
print('Accuracy:', accuracy)
```

这个程序会输出分类器的准确度。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值