



  • 实现compute_distances_two_loops
  def compute_distances_two_loops(self, X):
    """
    Compute the distance between each test point in X and each training point
    in self.X_train using a nested loop over both the training data and the
    test data.

    Inputs:
    - X: A numpy array of shape (num_test, D) containing test data.

    Returns:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      is the Euclidean distance between the ith test point and the jth training
      point.
    """
    num_test = X.shape[0]  # e.g. 500
    num_train = self.X_train.shape[0]  # e.g. 5000
    dists = np.zeros((num_test, num_train))  # e.g. 500 x 5000
    for i in range(num_test):
      for j in range(num_train):
        # L2 distance between one test row and one training row. np.sum
        # reduces over the feature dimension, so no explicit loop over D
        # is needed.
        diff = self.X_train[j, :] - X[i, :]
        dists[i, j] = np.sqrt(np.sum(np.square(diff)))
    return dists

  def compute_distances_one_loop(self, X):
    """
    Compute the distance between each test point in X and each training point
    in self.X_train using a single loop over the test data.

    Inputs:
    - X: A numpy array of shape (num_test, D) containing test data.

    Returns:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      is the Euclidean distance between the ith test point and the jth training
      point.
    """
    num_test = X.shape[0]
    num_train = self.X_train.shape[0]
    dists = np.zeros((num_test, num_train))
    for i in range(num_test):
      # Broadcasting subtracts test row i from every training row at once;
      # summing over axis=1 collapses the feature dimension, leaving one
      # distance per training point.
      diffs = self.X_train - X[i, :]
      dists[i, :] = np.sqrt(np.sum(np.square(diffs), axis=1))
    return dists

# Open cs231n/classifiers/k_nearest_neighbor.py and implement
# compute_distances_two_loops.

# Test your implementation:
dists = classifier.compute_distances_two_loops(X_test)
# Expect one row per test point and one column per training point.
print(dists.shape)

(500, 5000)


  • 实现predict_labels,判断测试集的标签(类别)
  def predict_labels(self, dists, k=1):
    """
    Given a matrix of distances between test points and training points,
    predict a label for each test point.

    Inputs:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      gives the distance between the ith test point and the jth training point.
    - k: The number of nearest neighbors that vote for the predicted label.

    Returns:
    - y: A numpy array of shape (num_test,) containing predicted labels for the
      test data, where y[i] is the predicted label for the ith test point.
    """
    num_test = dists.shape[0]  # e.g. 500
    y_pred = np.zeros(num_test)
    for i in range(num_test):
      # Indices of the k training points closest to test point i, then the
      # labels of those neighbors.
      nearest = np.argsort(dists[i, :])[:k]
      closest_y = self.y_train[nearest]
      # Majority vote. np.bincount counts occurrences per label value and
      # np.argmax returns the first maximum, so ties go to the smaller label.
      # NOTE(review): np.bincount needs non-negative integer labels; this
      # assumes self.y_train satisfies that -- confirm against the caller.
      y_pred[i] = np.argmax(np.bincount(closest_y))

    return y_pred
# With predict_labels implemented, classify the test set.
# k = 1 is plain Nearest Neighbor: each test point takes the label of its
# single closest training point.
y_test_pred = classifier.predict_labels(dists, k=1)

# Count the test points whose prediction matches the true label and report
# that count as a fraction of the test set.
num_correct = (y_test_pred == y_test).sum()
accuracy = num_correct / float(num_test)
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))

Got 137 / 500 correct => accuracy: 0.274000


# Same evaluation with a larger neighborhood: the 5 nearest training points
# vote on each test label.
y_test_pred = classifier.predict_labels(dists, k=5)
num_correct = (y_test_pred == y_test).sum()
accuracy = num_correct / float(num_test)
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))

Got 139 / 500 correct => accuracy: 0.278000


  • 实现compute_distances_one_loop
# Now let's speed up distance matrix computation by using partial vectorization
# with one loop. Implement the function compute_distances_one_loop and run the
# code below:
dists_one = classifier.compute_distances_one_loop(X_test)

# To ensure that our vectorized implementation is correct, we make sure that it
# agrees with the naive implementation. There are many ways to decide whether
# two matrices are similar; one of the simplest is the Frobenius norm. In case
# you haven't seen it before, the Frobenius norm of two matrices is the square
# root of the squared sum of differences of all elements; in other words, reshape
# the matrices into vectors and compute the Euclidean distance between them.
difference = np.linalg.norm(dists - dists_one, ord='fro')
print('Difference was: %f' % (difference, ))
if difference < 0.001:
    print('Good! The distance matrices are the same')
else:
    print('Uh-oh! The distance matrices are different')

Difference was: 0.000000
Good! The distance matrices are the same


  • 实现compute_distances_no_loops
  def compute_distances_no_loops(self, X):
    """
    Compute the distance between each test point in X and each training point
    in self.X_train using no explicit loops.

    Inputs:
    - X: A numpy array of shape (num_test, D) containing test data.

    Returns:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      is the Euclidean distance between the ith test point and the jth training
      point.
    """
    # Expand the squared distance so the whole matrix comes from one matrix
    # product plus two broadcast sums:
    #   ||x - t||^2 = ||x||^2 - 2 * x.t + ||t||^2
    test_sq = np.sum(np.square(X), axis=1).reshape(-1, 1)              # (num_test, 1)
    train_sq = np.sum(np.square(self.X_train), axis=1).reshape(1, -1)  # (1, num_train)
    cross = np.dot(X, self.X_train.T)                                  # (num_test, num_train)
    sq_dists = test_sq - 2 * cross + train_sq
    # Floating-point cancellation can leave tiny negative values where the
    # true distance is (near) zero; clamp them so sqrt does not return NaN.
    sq_dists = np.maximum(sq_dists, 0)
    return np.sqrt(sq_dists)

# Now implement the fully vectorized version inside compute_distances_no_loops
# and run the code
dists_two = classifier.compute_distances_no_loops(X_test)

# check that the distance matrix agrees with the one we computed before:
difference = np.linalg.norm(dists - dists_two, ord='fro')
print('Difference was: %f' % (difference, ))
if difference < 0.001:
    print('Good! The distance matrices are the same')
else:
    print('Uh-oh! The distance matrices are different')

Difference was: 0.000000
Good! The distance matrices are the same




# Let's compare how fast the implementations are
def time_function(f, *args):
    """
    Call a function f with args and return the time (in seconds) that it took to execute.

    Inputs:
    - f: The callable to time.
    - *args: Positional arguments forwarded to f.

    Returns:
    - The elapsed wall-clock time of the call in seconds, as a float. The
      return value of f itself is discarded.
    """
    import time
    tic = time.time()
    # The call being measured must sit between the two clock reads.
    f(*args)
    toc = time.time()
    return toc - tic

# Time each of the three distance implementations on the same test set so the
# numbers are directly comparable, then report them together.
two_loop_time = time_function(classifier.compute_distances_two_loops, X_test)
one_loop_time = time_function(classifier.compute_distances_one_loop, X_test)
no_loop_time = time_function(classifier.compute_distances_no_loops, X_test)

print('Two loop version took %f seconds' % two_loop_time)
print('One loop version took %f seconds' % one_loop_time)
print('No loop version took %f seconds' % no_loop_time)

# The fully vectorized implementation should be significantly faster than
# either looped version.


Two loop version took 34.215833 seconds
One loop version took 40.725616 seconds
No loop version took 0.347246 seconds


num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

# Split the training data into folds. np.array_split returns a Python list of
# arrays (a sequence, not one stacked array). axis=0 splits along the sample
# (row) axis, so X_train_folds[i] and y_train_folds[i] stay aligned:
# y_train_folds[i] is the label vector for the points in X_train_folds[i].
X_train_folds = np.array_split(X_train, num_folds, axis=0)
y_train_folds = np.array_split(y_train, num_folds, axis=0)

# A dictionary holding the accuracies for different values of k that we find
# when running cross-validation. After running cross-validation,
# k_to_accuracies[k] is a list of length num_folds giving the accuracy values
# found with that value of k, in fold order.
k_to_accuracies = {k: [] for k in k_choices}

# k-fold cross validation: each fold is held out once as the validation set
# while the remaining folds are used as training data.
for i in range(num_folds):
    X_val = X_train_folds[i]
    y_val = y_train_folds[i]
    # Concatenating the list slices skips fold i. np.vstack stacks the 2-D
    # feature blocks row-wise; np.hstack joins the 1-D label vectors end to end.
    X_tr = np.vstack(X_train_folds[:i] + X_train_folds[i + 1:])
    y_tr = np.hstack(y_train_folds[:i] + y_train_folds[i + 1:])

    classifier.train(X_tr, y_tr)
    # The distance matrix depends only on the fold, not on k, so compute it
    # once per fold and reuse it for every k.
    dists_cv = classifier.compute_distances_no_loops(X_val)

    for k in k_choices:
        y_val_pred = classifier.predict_labels(dists_cv, k)
        num_correct = np.sum(y_val_pred == y_val)
        # Divide by the actual size of this fold so the accuracy stays
        # correct even when the folds are not all the same size.
        k_to_accuracies[k].append(float(num_correct) / len(y_val))

# Print out the computed accuracies
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))


k = 1, accuracy = 0.263000
k = 1, accuracy = 0.257000
k = 1, accuracy = 0.264000
k = 1, accuracy = 0.278000
k = 1, accuracy = 0.266000
k = 3, accuracy = 0.239000
k = 3, accuracy = 0.249000
k = 3, accuracy = 0.240000
k = 3, accuracy = 0.266000
k = 3, accuracy = 0.254000
k = 5, accuracy = 0.248000
k = 5, accuracy = 0.266000
k = 5, accuracy = 0.280000
k = 5, accuracy = 0.292000
k = 5, accuracy = 0.280000
k = 8, accuracy = 0.262000
k = 8, accuracy = 0.282000
k = 8, accuracy = 0.273000
k = 8, accuracy = 0.290000
k = 8, accuracy = 0.273000
k = 10, accuracy = 0.265000
k = 10, accuracy = 0.296000
k = 10, accuracy = 0.276000
k = 10, accuracy = 0.284000
k = 10, accuracy = 0.280000
k = 12, accuracy = 0.260000
k = 12, accuracy = 0.295000
k = 12, accuracy = 0.279000
k = 12, accuracy = 0.283000
k = 12, accuracy = 0.280000
k = 15, accuracy = 0.252000
k = 15, accuracy = 0.289000
k = 15, accuracy = 0.278000
k = 15, accuracy = 0.282000
k = 15, accuracy = 0.274000
k = 20, accuracy = 0.270000
k = 20, accuracy = 0.279000
k = 20, accuracy = 0.279000
k = 20, accuracy = 0.282000
k = 20, accuracy = 0.285000
k = 50, accuracy = 0.271000
k = 50, accuracy = 0.288000
k = 50, accuracy = 0.278000
k = 50, accuracy = 0.269000
k = 50, accuracy = 0.266000
k = 100, accuracy = 0.256000
k = 100, accuracy = 0.270000
k = 100, accuracy = 0.263000
k = 100, accuracy = 0.256000
k = 100, accuracy = 0.263000


# Use one sorted list of k values for every plot element so the x positions
# always line up with the per-k statistics, whatever order k_choices is in.
ks = sorted(k_to_accuracies)

# plot the raw observations
for k in ks:
    accuracies = k_to_accuracies[k]
    plt.scatter([k] * len(accuracies), accuracies)

# plot the trend line with error bars that correspond to standard deviation
accuracies_mean = np.array([np.mean(k_to_accuracies[k]) for k in ks])
accuracies_std = np.array([np.std(k_to_accuracies[k]) for k in ks])
plt.errorbar(ks, accuracies_mean, yerr=accuracies_std)
plt.title('Cross-validation on k')
plt.xlabel('k')
plt.ylabel('Cross-validation accuracy')
plt.show()



# Pick the best k from the cross-validation results above, retrain on the
# full training set, and evaluate on the held-out test data. The target is
# above 28% accuracy on the test data.
best_k = 10  # chosen from the cross-validation plot above

classifier = KNearestNeighbor()
classifier.train(X_train, y_train)
y_test_pred = classifier.predict(X_test, k=best_k)

# Count the matching predictions and report them as a fraction of the test set.
num_correct = (y_test_pred == y_test).sum()
accuracy = num_correct / float(num_test)
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
Got 141 / 500 correct => accuracy: 0.282000
