KNN Study Notes

Working through the cs231n assignment. No fluff; the code is pasted directly below, with fairly detailed comments.

If any of the numpy functions in the code are hard to follow, see my other blog post for details.

import numpy as np

class KNearestNeighbor(object):
    
    def __init__(self):
        pass

    # Training just memorizes the training data.
    def train(self, X, y):    # X_train: (5000, 3072)   y_train: (5000,)
        self.X_train = X
        self.y_train = y

    # Three ways to compute Euclidean distances, using 0, 1, or 2 loops.
    # no_loops uses fully vectorized computation and is the fastest.
        
    def predict(self, X, k=1, num_loops=0):
        if num_loops == 0:
            dists = self.compute_distances_no_loops(X)
        elif num_loops == 1:
            dists = self.compute_distances_one_loop(X)
        elif num_loops == 2:
            dists = self.compute_distances_two_loops(X)
        else:
            raise ValueError('Invalid value %d for num_loops' % num_loops)
        return self.predict_labels(dists, k=k)    
    
    def compute_distances_two_loops(self, X):   # dists: (500, 5000)
        num_test = X.shape[0]    # 500
        num_train = self.X_train.shape[0]    # 5000
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            for j in range(num_train):
                # Euclidean distance between test sample i and every training sample.
                dists[i][j] = np.sqrt(np.sum(np.square(X[i] - self.X_train[j])))
        return dists
    
    def compute_distances_one_loop(self, X):   # X: (500, 3072)
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))   # (500, 5000)
        for i in range(num_test):
            # Broadcast test sample i against all 5000 training samples, square,
            # then sum along axis=1 to get a (5000,) vector of Euclidean distances.
            dists[i] = np.sqrt(np.sum(np.square(self.X_train - X[i]), axis=1))
        return dists
    
    def compute_distances_no_loops(self, X):
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        # Expand (a-b)^2 = a^2 + b^2 - 2ab, then rely on numpy broadcasting:
        # e.g. A = [1, 2], B = [[3], [4]]  =>  A + B = [[3+1, 3+2], [4+1, 4+2]].
        dists = np.sqrt(-2 * np.dot(X, self.X_train.T) +
                        np.sum(np.square(self.X_train), axis=1) +
                        np.transpose([np.sum(np.square(X), axis=1)]))
        return dists
        # np.sum's keepdims argument keeps the reduced dimension so the result
        # still broadcasts correctly. An equivalent step-by-step version:
        # test_sum = np.sum(np.square(X), axis=1, keepdims=True)         # (500, 1)
        # (or: test_sum = np.transpose([np.sum(np.square(X), axis=1)]) - both give (500, 1))
        # train_sum = np.sum(np.square(self.X_train), axis=1)            # (5000,)
        # test_mul_train = np.matmul(X, self.X_train.T)                  # (500, 5000), each entry sums 3072 cross terms
        # dists = np.sqrt(test_sum + train_sum - 2 * test_mul_train)     # (a-b)^2 = a^2 + b^2 - 2ab, the L2 distance

    
    def predict_labels(self, dists, k=1):
        num_test = dists.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            # np.bincount needs a 1-D array of nonnegative ints, and np.squeeze
            # strips singleton dimensions so closest_y meets that requirement:
            #     bincount(x, weights=None, minlength=0)
            #     x : array_like, 1 dimension, nonnegative ints. Input array.
            closest_y = self.y_train[np.argsort(dists[i])[:k]]   # sort row i ascending, take the indices of the k nearest
            # Among the k nearest of the 5000 training samples, take the most
            # frequent label; np.bincount is indexed by the labels 0-9, so
            # np.argmax recovers the winning label.
            y_pred[i] = np.argmax(np.bincount(np.squeeze(closest_y)))
        return y_pred    # (500,)
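
Before moving on, a minimal sanity-check sketch (the toy shapes and seed here are made up for illustration) showing that the broadcast expansion (a-b)^2 = a^2 + b^2 - 2ab reproduces the naive two-loop distances, and how the bincount vote behaves:

import numpy as np

np.random.seed(0)
X_tr = np.random.randn(6, 4)   # toy "training" set
X_te = np.random.randn(3, 4)   # toy "test" set

# Naive two-loop Euclidean distances.
naive = np.zeros((3, 6))
for i in range(3):
    for j in range(6):
        naive[i, j] = np.sqrt(np.sum(np.square(X_te[i] - X_tr[j])))

# Vectorized: a^2 is (3, 1), b^2 is (6,), -2ab is (3, 6);
# broadcasting lines them all up as (3, 6).
vec = np.sqrt(np.sum(np.square(X_te), axis=1, keepdims=True)
              + np.sum(np.square(X_tr), axis=1)
              - 2 * np.dot(X_te, X_tr.T))
print(np.allclose(naive, vec))   # True

# Majority vote: labels [2, 2, 1] -> bincount [0, 1, 2] -> argmax 2.
print(np.argmax(np.bincount(np.array([2, 2, 1]))))   # 2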

For testing I use CIFAR-10, including cross-validation to find the best value of k.

import random
import numpy as np
from cs231n.data_utils import load_CIFAR10
import matplotlib.pyplot as plt
# Changed to the local dataset path
cifar10_dir = '/Users/user/Desktop/jupterCode/data/cifar-10-batches-py'
X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

# As a sanity check, we print out the size of the training and test data.
print('Training data shape: ', X_train.shape)
print('Training labels shape: ', y_train.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)

# Subsample the data for more efficient code execution in this exercise
num_training = 5000
mask = range(num_training)
X_train = X_train[mask]
y_train = y_train[mask]

num_test = 500
mask = range(num_test)
X_test = X_test[mask]
y_test = y_test[mask]
print(X_train.shape)   #(5000, 32, 32, 3)
print(X_train[0].shape)   #(32, 32, 3)
print(y_train.shape) #(5000,)

# Reshape the image data into rows
print(X_train.shape)   #(5000, 32, 32, 3)
X_train = np.reshape(X_train, (X_train.shape[0], -1))   # 5000x32x32x3 -> 5000x3072
X_test = np.reshape(X_test, (X_test.shape[0], -1))
print(X_train.shape, X_test.shape)  # (5000, 3072) (500, 3072)
print(X_train[0])   # [ 59.  62.  63. ... 123.  92.  72.]  (3072,)

from cs231n.classifiers import KNearestNeighbor

# Create a kNN classifier instance. 
# Remember that training a kNN classifier is a no-op:
# the Classifier simply remembers the data and does no further processing
classifier = KNearestNeighbor()
classifier.train(X_train, y_train)  # X_train:(5000, 3072)   y_train:(5000,)

# Open cs231n/classifiers/k_nearest_neighbor.py and implement
# compute_distances_two_loops.

# Test your implementation:
dists = classifier.compute_distances_two_loops(X_test)
print(dists.shape)  # (500, 5000)

# Now implement the function predict_labels and run the code below:
# We use k = 1 (which is Nearest Neighbor).
y_test_pred = classifier.predict_labels(dists, k=1)

# Compute and print the fraction of correctly predicted examples
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
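
The earlier claim that the fully vectorized version is the fastest can be checked with a quick timing sketch. This mirrors the time_function helper from the original notebook; absolute numbers depend on your machine:

import time

def time_function(f, *args):
    # Return how long f(*args) takes, in seconds.
    tic = time.time()
    f(*args)
    return time.time() - tic

print('two loops: %fs' % time_function(classifier.compute_distances_two_loops, X_test))
print('one loop:  %fs' % time_function(classifier.compute_distances_one_loop, X_test))
print('no loops:  %fs' % time_function(classifier.compute_distances_no_loops, X_test))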

Cross-validation

We have implemented the k-Nearest Neighbor classifier but we set the value k = 5 arbitrarily. We will now determine the best value of this hyperparameter with cross-validation. In machine learning, when data is not plentiful, cross-validation is a good approach to model selection (deep learning demands large amounts of data and generally skips cross-validation because it is too time-consuming). In this section we use cross-validation to choose the k that yields the best prediction accuracy. Specifically, we use S-fold cross-validation: split the data into S equal parts, hold out one part as the validation set, and train on the rest. Typically S = 10; here we set S = 5, i.e. num_folds = 5 in the code.
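To make the rotation concrete, here is a minimal sketch (fold indices only, S = 5) of which fold is held out in each round:

S = 5
for i in range(S):
    train_folds = [j for j in range(S) if j != i]
    print('round %d: validate on fold %d, train on folds %s' % (i, i, train_folds))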

num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

X_train_folds = []
y_train_folds = []

# Split up the training data into folds. After splitting, X_train_folds and
# y_train_folds should each be lists of length num_folds, where
# y_train_folds[i] is the label vector for the points in X_train_folds[i].
# Hint: Look up the numpy array_split function.
y_train_ = y_train.reshape(-1, 1)
X_train_folds, y_train_folds = np.array_split(X_train, num_folds), np.array_split(y_train_, num_folds)   # each fold: (1000, 3072) and (1000, 1)
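
As a quick reference, a small sketch of what np.array_split returns on toy data; unlike np.split, it also tolerates sizes that do not divide evenly:

demo = np.arange(10).reshape(5, 2)
parts = np.array_split(demo, 3)    # 5 rows into 3 folds
print([p.shape for p in parts])    # [(2, 2), (2, 2), (1, 2)]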


# A dictionary holding the accuracies for different values of k that we find
# when running cross-validation. After running cross-validation,
# k_to_accuracies[k] should be a list of length num_folds giving the different
# accuracy values that we found when using that value of k.

k_to_accuracies = {}   # store each k and its list of accuracies in a dictionary



# Perform k-fold cross validation to find the best value of k. For each
# possible value of k, run the k-nearest-neighbor algorithm num_folds times,
# where in each case you use all but one of the folds as training data and the
# last fold as a validation set. Store the accuracies for all folds and all
# values of k in the k_to_accuracies dictionary.

for k_ in k_choices:
    k_to_accuracies.setdefault(k_, [])
for i in range(num_folds):    # each round, hold out one fold for validation and train on the rest
    classifier = KNearestNeighbor()
    X_val_train = np.vstack(X_train_folds[0:i] + X_train_folds[i+1:])   # all folds except i as training data: (4000, 3072)
    y_val_train = np.vstack(y_train_folds[0:i] + y_train_folds[i+1:])   # (4000, 1)
    y_val_train = y_val_train[:, 0]   # first column, a 1-D array of shape (4000,)
    classifier.train(X_val_train, y_val_train)
    for k_ in k_choices:
        y_val_pred = classifier.predict(X_train_folds[i], k=k_)
        num_correct = np.sum(y_val_pred == y_train_folds[i][:, 0])   # predict on fold i, the held-out validation set
        accuracy = float(num_correct) / len(y_val_pred)
        k_to_accuracies[k_].append(accuracy)


# Print out the computed accuracies
for k in sorted(k_to_accuracies):   # print every accuracy obtained for each value of k
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))
# plot the raw observations
for k in k_choices:
    accuracies = k_to_accuracies[k]
    plt.scatter([k] * len(accuracies), accuracies)

# plot the trend line with error bars that correspond to standard deviation
accuracies_mean = np.array([np.mean(v) for k,v in sorted(k_to_accuracies.items())])
accuracies_std = np.array([np.std(v) for k,v in sorted(k_to_accuracies.items())])
plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
plt.title('Cross-validation on k')
plt.xlabel('k')
plt.ylabel('Cross-validation accuracy')
plt.show()
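
Rather than eyeballing the plot, the mean accuracies can also be compared directly; a minimal sketch (the assignment simply hardcodes the winner below):

mean_accs = {k: np.mean(k_to_accuracies[k]) for k in k_choices}
print(max(mean_accs, key=mean_accs.get))   # k with the highest mean cross-validation accuracy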
# Based on the cross-validation results above, choose the best value for k,   
# retrain the classifier using all the training data, and test it on the test
# data. You should be able to get above 28% accuracy on the test data.
best_k = 10
print(X_test.shape)
print(X_train.shape)
classifier = KNearestNeighbor()
classifier.train(X_train, y_train)
y_test_pred = classifier.predict(X_test, k=best_k)

# Compute and display the accuracy
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))