说来惭愧,这应该是很早以前就完成的任务了,但第一次做非常不认真,这次重刷一遍,记录如下。
knn还是很熟悉的,毕竟本科毕设就用过,但用numpy实现还是有些难度的。
knn部分
import numpy as np
class KNearestNeighbor:
    """A k-nearest-neighbor classifier using L2 (Euclidean) distance.

    Training only memorizes the data. Prediction computes the distance from
    every test point to every training point and lets the k closest training
    points vote on the label.
    """

    def __init__(self):
        pass

    def train(self, X, y):
        """Memorize the training data.

        Inputs:
        - X: A numpy array of shape (num_train, D) containing the training
          data, i.e. num_train samples each of dimension D.
        - y: A numpy array of shape (num_train,) containing the training
          labels, where y[i] is the label for X[i].
        """
        self.X_train = X
        self.y_train = y

    def predict(self, X, k=1, num_loops=0):
        """Predict labels for test data using this classifier.

        Inputs:
        - X: A numpy array of shape (num_test, D) containing test data.
        - k: The number of nearest neighbors that vote for the predicted
          labels.
        - num_loops: Which distance implementation to use: 0 (fully
          vectorized), 1 (one loop over test points) or 2 (nested loops).

        Returns:
        - y: A numpy array of shape (num_test,) containing predicted labels,
          where y[i] is the predicted label for the test point X[i].

        Raises:
        - ValueError: if num_loops is not 0, 1 or 2.
        """
        if num_loops == 0:
            dists = self.compute_distances_no_loops(X)
        elif num_loops == 1:
            dists = self.compute_distances_one_loop(X)
        elif num_loops == 2:
            dists = self.compute_distances_two_loops(X)
        else:
            raise ValueError('Invalid value %d for num_loops' % num_loops)
        return self.predict_labels(dists, k=k)

    def compute_distances_two_loops(self, X):
        """Compute test-to-train distances with a nested loop.

        Inputs:
        - X: A numpy array of shape (num_test, D) containing test data.

        Returns:
        - dists: A numpy array of shape (num_test, num_train) where
          dists[i, j] is the Euclidean distance between the ith test point
          and the jth training point.
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            for j in range(num_train):
                # The sum runs over the feature dimension of a single pair,
                # so no explicit loop over D is needed.
                diff = X[i, :] - self.X_train[j, :]
                dists[i, j] = np.sqrt(np.sum(np.square(diff)))
        return dists

    def compute_distances_one_loop(self, X):
        """Compute test-to-train distances with a single loop over test data.

        Input / Output: Same as compute_distances_two_loops.
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            # X[i, :] broadcasts against every training row; summing over
            # axis=1 (the feature axis) gives the distance from test point i
            # to each training point.
            diff = X[i, :] - self.X_train
            dists[i, :] = np.sqrt(np.sum(np.square(diff), axis=1))
        return dists

    def compute_distances_no_loops(self, X):
        """Compute test-to-train distances with no explicit loops.

        Uses the expansion |x - y|^2 = |x|^2 - 2 * x.y + |y|^2, i.e. one
        matrix multiplication plus two broadcast sums.

        Input / Output: Same as compute_distances_two_loops.
        """
        # (num_test, num_train) matrix of -2 * x.y cross terms.
        cross = np.multiply(np.dot(X, self.X_train.T), -2)
        # keepdims=True keeps the test norms as a (num_test, 1) column so
        # they broadcast along rows; the train norms broadcast along columns.
        test_sq = np.sum(np.square(X), axis=1, keepdims=True)
        train_sq = np.sum(np.square(self.X_train), axis=1)
        sq_dists = np.add(np.add(cross, test_sq), train_sq)
        # Floating-point cancellation can leave tiny negative values for
        # (near-)identical points; clamp them so sqrt never yields NaN.
        sq_dists = np.maximum(sq_dists, 0)
        return np.sqrt(sq_dists)

    def predict_labels(self, dists, k=1):
        """Predict a label for each test point from a distance matrix.

        Inputs:
        - dists: A numpy array of shape (num_test, num_train) where
          dists[i, j] gives the distance between the ith test point and the
          jth training point.
        - k: The number of nearest neighbors that vote.

        Returns:
        - y: A numpy array of shape (num_test,) containing predicted labels,
          where y[i] is the predicted label for the ith test point.

        Note: voting uses np.bincount, which requires the training labels to
        be non-negative integers.
        """
        num_test = dists.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            # Labels (not indices) of the k training points closest to test
            # point i.
            nearest = np.argsort(dists[i])[:k]
            closest_y = self.y_train[nearest]
            # bincount counts votes per label; argmax returns the first
            # maximum, so ties are broken in favor of the smaller label.
            y_pred[i] = np.argmax(np.bincount(closest_y))
        return y_pred
cross validation部分:
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

# Split the training data into num_folds folds. np.array_split returns a list
# of arrays, and y_train_folds[i] is the label vector for X_train_folds[i].
X_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(y_train, num_folds)

# Maps each k to an array of length num_folds holding the validation accuracy
# obtained on each fold with that k.
k_to_accuracies = {}

# For every candidate k, run kNN num_folds times: each run holds out one fold
# for validation and trains on the remaining folds.
classifier = KNearestNeighbor()
for k in k_choices:
    accuracies = np.zeros(num_folds)
    for fold in range(num_folds):
        X_validate_fold = X_train_folds[fold]
        y_validate_fold = y_train_folds[fold]
        # Stack every fold except the held-out one into the training set.
        temp_X = np.concatenate(X_train_folds[:fold] + X_train_folds[fold + 1:])
        temp_y = np.concatenate(y_train_folds[:fold] + y_train_folds[fold + 1:])
        classifier.train(temp_X, temp_y)
        y_validate_pred = classifier.predict(X_validate_fold, k=k)
        num_correct = np.sum(y_validate_pred == y_validate_fold)
        # Normalize by the size of the validation fold, not by the size of
        # the separate test set.
        accuracies[fold] = num_correct / len(y_validate_fold)
    k_to_accuracies[k] = accuracies

# Print out the computed accuracies
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))
Note
- x.shape 是属性而不是方法,使用时没有括号
- axis的用法
通过指定不同的axis,numpy会沿着不同的方向进行操作:如果不设置axis,则表示对所有的元素进行操作;如果axis=0,则沿着纵轴进行操作;若axis=1,则沿着横轴进行操作。不过这种说法仅适用于二维数组。更一般地,可以总结为一句话:设axis=i,则numpy沿着第i个下标变化的方向进行操作。
最直观的理解:函数所选的axis的值,就表示 x[ ][ ][ ] 中的第几个方括号(从0开始计数),例如axis=0代表第一个[ ]。
参考:https://blog.csdn.net/xiongchengluo1129/article/details/79062991
- keepdims
keepdims主要用于保持矩阵的二维特性
参考:https://blog.csdn.net/u012560212/article/details/78393836
- argsort
argsort()函数将x中的元素从小到大排列,提取其对应的index(索引),然后输出到y(例如 y = x.argsort())
- argmax
argmax返回的是最大数的索引(有多个最大值时返回第一个的索引)
- bincount
每个bin给出了它的索引值在x中出现的次数
# The largest value in x is 7, so there are 8 bins, with indices 0 through 7.
x = np.array([0, 1, 1, 3, 2, 1, 7])
# Index 0 occurs once, index 1 occurs three times, ..., index 5 occurs zero times, ...
np.bincount(x)
# So the output is: array([1, 3, 1, 1, 0, 0, 0, 1])