# 第三章 KNN实战 (Chapter 3: KNN in practice)
#### KNN算法实现 (KNN algorithm implementation)
import numpy as np
import matplotlib.pyplot as plt
import operator
# 给出训练数据以及对应的类别
def createDataSet():
    """Return a toy 2-D training set and its class labels ('A' / 'B')."""
    samples = [[1.0, 2.0], [1.2, 0.1], [0.1, 1.4],
               [0.3, 3.5], [1.1, 1.0], [0.5, 1.5]]
    classes = ['A', 'A', 'B', 'B', 'A', 'B']
    return np.array(samples), np.array(classes)
##这里的K值是固定值
def kNN_classify(k, dis, X_train, x_train, Y_test):
    """Classify each row of Y_test by a k-nearest-neighbour majority vote.

    Parameters
    ----------
    k : int
        Number of neighbours taking part in the vote.
    dis : str
        'E' for Euclidean distance, 'M' for Manhattan distance.
    X_train : ndarray, shape (n_train, n_features)
        Training samples.
    x_train : ndarray, shape (n_train,)
        Labels of the training samples.
    Y_test : ndarray, shape (n_test, n_features)
        Samples to classify.

    Returns
    -------
    ndarray of predicted labels, one per row of Y_test.
    """
    assert dis == 'E' or dis == 'M', 'dis must E or M ,E代表欧式距离,M代表曼哈顿距离'
    num_test = Y_test.shape[0]  # number of test samples
    # Cast once, outside the loop: squaring differences of unsigned image
    # data (e.g. uint8 MNIST pixels) would otherwise wrap around.
    Xf = X_train.astype(np.float64)
    labellist = []
    for i in range(num_test):
        # Broadcasting replaces the original np.tile copy of Y_test[i].
        diff = Xf - Y_test[i]
        if dis == 'E':
            distances = np.sqrt(np.sum(diff ** 2, axis=1))  # Euclidean
        else:
            distances = np.sum(np.abs(diff), axis=1)  # Manhattan
        # Indices of the k closest training samples, nearest first.
        topK = np.argsort(distances)[:k]
        # Vote: count each label among the k neighbours.  Building the dict
        # nearest-first means sorted() (stable) breaks ties in favour of the
        # closer neighbour's label, matching the original behaviour.
        classCount = {}
        for j in topK:  # 'j' avoids shadowing the outer loop variable 'i'
            classCount[x_train[j]] = classCount.get(x_train[j], 0) + 1
        sortedClassCount = sorted(classCount.items(),
                                  key=operator.itemgetter(1), reverse=True)
        labellist.append(sortedClassCount[0][0])
    return np.array(labellist)
if __name__ == '__main__':
    # Smoke-test the classifier on the toy data set.
    group, labels = createDataSet()
    y_test_pred = kNN_classify(2, 'E', group, labels,
                               np.array([[1.0, 2.1], [0.4, 2.0]]))
    print(y_test_pred)  # expected output: ['A' 'B']
####KNN实现MNIST数据分类
import torch
from torch.utils.data import DataLoader
import torchvision.datasets as dsets
import torchvision.transforms as transforms
# Number of samples per mini-batch.
batch_size = 100

# Download (if necessary) and wrap the MNIST training and test splits.
# NOTE(review): transform=None keeps the raw PIL/tensor data — presumably
# intentional, since the code below reads .train_data directly.
train_dataset = dsets.MNIST(root='/ml/pymnist', train=True, transform=None, download=True)
test_dataset = dsets.MNIST(root='/ml/pymnist', train=False, transform=None, download=True)

# Shuffled mini-batch loaders over both splits.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)

# Report the raw tensor sizes of each split.
print("train_data:", train_dataset.train_data.size())
print("train_labels:", train_dataset.train_labels.size())
print("test_data:", test_dataset.test_data.size())
print("test_labels:", test_dataset.test_labels.size())
if __name__ == '__main__':
    # Convert the raw tensors to NumPy and flatten each 28x28 image into a
    # 784-dim feature vector for the kNN classifier.
    X_train = train_loader.dataset.train_data.numpy()
    X_train = X_train.reshape(X_train.shape[0], 28 * 28)
    y_train = train_loader.dataset.train_labels.numpy()
    # Evaluate on only the first 1000 test images to keep the run time down.
    X_test = test_loader.dataset.test_data[:1000].numpy()
    X_test = X_test.reshape(X_test.shape[0], 28 * 28)
    y_test = test_loader.dataset.test_labels[:1000].numpy()
    num_test = y_test.shape[0]
    y_test_pred = kNN_classify(5, 'M', X_train, y_train, X_test)
    num_correct = np.sum(y_test_pred == y_test)
    accuracy = float(num_correct) / num_test
    print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
# Accuracy on raw pixels is low (~36.8%); the data is normalised below.
def getXmean(X_train):
    """Return the per-pixel mean image of the (flattened) training set."""
    # Flatten each image into a 1-D vector, then average over the sample axis.
    flattened = np.reshape(X_train, (X_train.shape[0], -1))
    return np.mean(flattened, axis=0)
def centralized(X_test, mean_image):
    """Flatten each image and subtract the mean image (zero-centring).

    Parameters
    ----------
    X_test : ndarray — images, any shape whose first axis indexes samples.
    mean_image : ndarray — per-pixel mean returned by getXmean.

    Returns
    -------
    ndarray, shape (n_samples, n_pixels), dtype float64, zero-centred.
    """
    X_test = np.reshape(X_test, (X_test.shape[0], -1))  # flatten each image to 1-D
    # Fix: np.float was removed in NumPy 1.20+; use np.float64 explicitly.
    X_test = X_test.astype(np.float64)
    X_test -= mean_image  # subtract the mean image => zero-mean data
    return X_test
if __name__ == '__main__':
    # Zero-centre both splits using the TRAINING-set mean image only.
    X_train = train_loader.dataset.train_data.numpy()
    mean_image = getXmean(X_train)  # per-pixel mean over the training images
    X_train = centralized(X_train, mean_image)
    y_train = train_loader.dataset.train_labels.numpy()
    X_test = test_loader.dataset.test_data[:1000].numpy()
    X_test = centralized(X_test, mean_image)  # centre test data with the train mean
    y_test = test_loader.dataset.test_labels[:1000].numpy()
    num_test = y_test.shape[0]
    y_test_pred = kNN_classify(5, 'M', X_train, y_train, X_test)
    num_correct = np.sum(y_test_pred == y_test)
    accuracy = float(num_correct) / num_test
    print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
## After zero-centring, accuracy rises to about 95%.
##交叉验证得到超参数K
class Knn:
    """Minimal k-nearest-neighbour classifier (lazy learner).

    fit() just memorises the training data; predict() votes among the k
    closest training samples under the chosen distance metric.
    """

    def __init__(self):
        pass

    def fit(self, X_train, y_train):
        """Store the training samples and their labels."""
        self.Xtr = X_train
        self.ytr = y_train

    def predict(self, k, dis, X_test):
        """Predict a label for each row of X_test.

        k   : number of neighbours to vote.
        dis : 'E' (Euclidean) or 'M' (Manhattan).
        Returns an ndarray of predicted labels.
        """
        assert dis == 'E' or dis == 'M', 'dis must E or M'
        num_test = X_test.shape[0]
        labellist = []
        for i in range(num_test):
            # Broadcasting replaces the original np.tile copy — and fixes the
            # np.title typo that made the Euclidean branch crash.
            diff = self.Xtr - X_test[i]
            if dis == 'E':
                distances = np.sqrt(np.sum(diff ** 2, axis=1))  # Euclidean
            else:
                # Sum of absolute per-feature differences (Manhattan).
                distances = np.sum(np.abs(diff), axis=1)
            topK = np.argsort(distances)[:k]  # k closest samples, nearest first
            # Majority vote; stable sort breaks ties toward the nearer label.
            classCount = {}
            for j in topK:
                classCount[self.ytr[j]] = classCount.get(self.ytr[j], 0) + 1
            sortedClassCount = sorted(classCount.items(),
                                      key=operator.itemgetter(1), reverse=True)
            labellist.append(sortedClassCount[0][0])
        return np.array(labellist)
# Prepare the full (zero-centred) train/test sets for cross-validation.
# Fix: 'datset' typo, and convert the tensors to NumPy with .numpy() so the
# .astype() call inside centralized() works (torch tensors have no astype).
X_train = train_loader.dataset.train_data.numpy()
X_train = X_train.reshape(X_train.shape[0], -1)
mean_image = getXmean(X_train)
X_train = centralized(X_train, mean_image)
y_train = train_loader.dataset.train_labels.numpy()
y_train = np.array(y_train)
X_test = test_loader.dataset.test_data.numpy()
X_test = X_test.reshape(X_test.shape[0], -1)
X_test = centralized(X_test, mean_image)  # centre with the TRAIN mean
y_test = test_loader.dataset.test_labels.numpy()
y_test = np.array(y_test)
# Sanity-check the shapes before running cross-validation.
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
# Split the training data into num_folds parts; each part takes a turn as
# the validation set during cross-validation.
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20]  # candidate k values (usually 1-20)
num_training = X_train.shape[0]
X_train_folds = []
y_train_folds = []
# Fix: the keyword 'indices_or_sections' was broken across two lines in the
# original source, which is a syntax error.
indices = np.array_split(np.arange(num_training), indices_or_sections=num_folds)
# Gather the samples/labels of each fold by their index arrays.
for fold_idx in indices:
    X_train_folds.append(X_train[fold_idx])
    y_train_folds.append(y_train[fold_idx])
# Cross-validate every candidate k: each fold serves once as validation set.
k_to_accuracies = {}
for k in k_choices:
    fold_accuracies = []
    for i in range(num_folds):
        # Training data = every fold except fold i.
        train_x = np.concatenate(X_train_folds[0:i] + X_train_folds[i + 1:], axis=0)
        train_y = np.concatenate(y_train_folds[0:i] + y_train_folds[i + 1:])
        # Fold i is held out for validation.
        val_x = X_train_folds[i]
        val_y = y_train_folds[i]
        model = Knn()
        model.fit(train_x, train_y)
        predictions = model.predict(k, 'M', val_x)
        fold_accuracies.append(np.mean(predictions == val_y))
    k_to_accuracies[k] = fold_accuracies  # per-fold accuracies for this k

# Report the accuracy of every (k, fold) pair.
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))
# Visualise the k-vs-accuracy trend: raw per-fold points plus a mean line
# with standard-deviation error bars.
import matplotlib.pyplot as plt

for k in k_choices:
    fold_acc = k_to_accuracies[k]
    plt.scatter([k] * len(fold_acc), fold_acc)

# Mean and standard deviation per k, in ascending k order.
sorted_items = sorted(k_to_accuracies.items())
accuracies_mean = np.array([np.mean(v) for _, v in sorted_items])
accuracies_std = np.array([np.std(v) for _, v in sorted_items])
plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
plt.title('Cross-validation on k')
plt.xlabel('k')
plt.ylabel('Cross-validation accuracy')
plt.show()