一、k_nearest_neighbor.py 部分:
1.两层循环
def compute_distances_two_loops(self, X):
    """
    Compute the distance between each test point in X and each training point
    in self.X_train using a nested loop over both the training data and the
    test data.

    Inputs:
    - X: A numpy array of shape (num_test, D) containing test data.

    Returns:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      is the Euclidean distance between the ith test point and the jth training
      point.
    """
    # The author's note gives X_train as (5000, 3072) and X as (500, 3072);
    # nothing below depends on those particular sizes.
    num_test = X.shape[0]
    num_train = self.X_train.shape[0]
    dists = np.zeros((num_test, num_train))
    for i in range(num_test):
        # The test row does not change inside the inner loop, so fetch it once.
        test_point = X[i]
        for j in range(num_train):
            #####################################################################
            # TODO:                                                             #
            # Compute the l2 distance between the ith test point and the jth    #
            # training point, and store the result in dists[i, j]. You should   #
            # not use a loop over dimension, nor use np.linalg.norm().          #
            #####################################################################
            # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
            # Square root of the summed squared per-feature differences.
            diff = self.X_train[j] - test_point
            dists[i, j] = np.sqrt(np.sum(np.square(diff)))
            # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return dists
2.一层循环
def compute_distances_one_loop(self, X):
    """
    Compute the Euclidean distance from every row of X to every row of
    self.X_train, looping only over the test points.

    Inputs:
    - X: A numpy array of shape (num_test, D) containing test data.

    Returns:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      is the Euclidean distance between the ith test point and the jth training
      point.
    """
    n_test, n_train = X.shape[0], self.X_train.shape[0]
    dists = np.zeros((n_test, n_train))
    for i, test_point in enumerate(X):
        #######################################################################
        # TODO:                                                               #
        # Compute the l2 distance between the ith test point and all training #
        # points, and store the result in dists[i, :].                        #
        # Do not use np.linalg.norm().                                        #
        #######################################################################
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        # Broadcasting subtracts the single test row from every training row.
        deltas = test_point - self.X_train
        # Summing over axis=1 yields a 1-D array of length num_train, which
        # fits the row dists[i, :] directly; no transpose is needed because
        # transposing a 1-D array is a no-op.
        dists[i, :] = np.sqrt((deltas * deltas).sum(axis=1))
        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return dists
3.不用循环
def compute_distances_no_loops(self, X):
    """
    Compute the distance between each test point in X and each training point
    in self.X_train using no explicit loops.

    Inputs:
    - X: A numpy array of shape (num_test, D) containing test data.

    Returns:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      is the Euclidean distance between the ith test point and the jth training
      point.
    """
    #########################################################################
    # TODO:                                                                 #
    # Compute the l2 distance between all test points and all training      #
    # points without using any explicit loops, and store the result in      #
    # dists.                                                                #
    #                                                                       #
    # You should implement this function using only basic array operations; #
    # in particular you should not use functions from scipy,                #
    # nor use np.linalg.norm().                                             #
    #                                                                       #
    # HINT: Try to formulate the l2 distance using matrix multiplication    #
    #       and two broadcast sums.                                         #
    #########################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    # Uses the expansion ||x - t||^2 = ||x||^2 - 2 x.t + ||t||^2.
    # np.sum(..., axis=1) returns 1-D arrays. The test norms are reshaped to
    # a column (num_test, 1) so they broadcast across columns, while the
    # training norms (num_train,) broadcast across rows.
    test_sq = np.sum(np.square(X), axis=1).reshape(-1, 1)
    train_sq = np.sum(np.square(self.X_train), axis=1)
    cross = X.dot(self.X_train.T)  # (num_test, num_train)
    sq_dists = test_sq + train_sq - 2 * cross
    # Floating-point cancellation can leave a tiny negative value where the
    # true squared distance is ~0 (e.g. identical points); clamp so the
    # square root never produces NaN.
    dists = np.sqrt(np.maximum(sq_dists, 0))
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return dists
二、knn.ipynb 部分:
交叉验证
# Number of cross-validation folds and the candidate values of k to evaluate.
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

X_train_folds = []
y_train_folds = []
################################################################################
# TODO:                                                                        #
# Split up the training data into folds. After splitting, X_train_folds and    #
# y_train_folds should each be lists of length num_folds, where                #
# y_train_folds[i] is the label vector for the points in X_train_folds[i].     #
# Hint: Look up the numpy array_split function.                                #
################################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
# Split the data and the labels the same way, so that fold i of the labels
# lines up with fold i of the data.
X_train_folds = np.array_split(X_train, num_folds)
# Each label fold is a column vector of shape (n, 1). Only the value handed to
# array_split is reshaped; y_train itself is NOT rebound, so code that uses
# y_train afterwards still sees its original shape.
y_train_folds = np.array_split(y_train.reshape(-1, 1), num_folds)
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
# A dictionary holding the accuracies for different values of k that we find
# when running cross-validation. After running cross-validation,
# k_to_accuracies[k] should be a list of length num_folds giving the different
# accuracy values that we found when using that value of k.
k_to_accuracies = {k: [] for k in k_choices}

################################################################################
# TODO:                                                                        #
# Perform k-fold cross validation to find the best value of k. For each        #
# possible value of k, run the k-nearest-neighbor algorithm num_folds times,   #
# where in each case you use all but one of the folds as training data and the #
# last fold as a validation set. Store the accuracies for all fold and all     #
# values of k in the k_to_accuracies dictionary.                               #
################################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
for i in range(num_folds):
    # Fold i is held out for validation; every other fold is used for training.
    x_val_train = np.vstack(X_train_folds[:i] + X_train_folds[i + 1:])
    # ravel() flattens each label fold to 1-D, so this works whether the folds
    # are stored as (n,) vectors or as (n, 1) columns.
    y_val_train = np.concatenate(
        [fold.ravel() for fold in y_train_folds[:i] + y_train_folds[i + 1:]]
    )
    y_val = y_train_folds[i].ravel()

    classifier = KNearestNeighbor()
    classifier.train(x_val_train, y_val_train)
    # The distance matrix does not depend on k, so compute it once per fold
    # and reuse it for every candidate k.
    dists = classifier.compute_distances_no_loops(X_train_folds[i])
    for k in k_choices:
        y_val_pred = classifier.predict_labels(dists, k=k)
        num_correct = np.sum(y_val_pred == y_val)
        accuracy = float(num_correct) / len(y_val_pred)
        # One accuracy per (k, fold); append in place instead of rebuilding
        # the list on every update.
        k_to_accuracies[k].append(accuracy)
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
# Print every recorded accuracy, grouped by k in ascending order.
for k, fold_accuracies in sorted(k_to_accuracies.items()):
    for accuracy in fold_accuracies:
        print('k = %d, accuracy = %f' % (k, accuracy))