一、MNIST数据集
MNIST数据集是机器学习领域中非常经典的一个数据集,由60000个训练样本和10000个测试样本组成,每个样本都是一张28 * 28像素的灰度手写数字图片。
二、KNN算法
KNN算法,即k最近邻(k-Nearest Neighbors)算法:将待测试数据的每个特征与训练样本数据的每个特征进行比较,提取k个最邻近的训练样本,统计这k个训练样本的分类标签,其中出现次数最多的标签所表示的类别就是待测试数据的类别。KNN的含义就是k个最相近的“邻居”。
三、KNN实现MNIST数据分类
载入MNIST数据集
# Data-loading configuration.
batch_size = 100
path = './'

# Download (if necessary) and open the two MNIST splits.
# transform=None keeps the raw images; conversion to numpy happens later.
train_datasets = datasets.MNIST(
    root=path,       # root directory for the dataset files
    train=True,      # training split (60000 samples)
    transform=None,
    download=True,
)
test_datasets = datasets.MNIST(
    root=path,
    train=False,     # test split (10000 samples)
    transform=None,
    download=True,
)

# Wrap the datasets in loaders; the KNN code below reads .dataset directly.
train_loader = DataLoader(train_datasets, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_datasets, batch_size=batch_size, shuffle=True)
对训练数据和测试数据进行处理,为提高识别率,首先将数据转化为numpy矩阵,并将28*28的矩阵转化为一维行向量,然后对数据进行归一化处理。
# Prepare the training data: each 28x28 image is flattened to a 784-vector
# and mean-centred (getXmean / centralized are defined in Utils.py).
x_train = train_loader.dataset.data.numpy()
# Per-pixel mean over all training images, then subtract it.
mean_image = getXmean(x_train)
x_train = centralized(x_train, mean_image)
y_train = train_loader.dataset.targets.numpy()
# Prepare the test data: only the first num_test samples are evaluated.
# NOTE(review): the test set is centred with its OWN mean rather than the
# training mean — confirm this is intended.
num_test = 200
x_test = test_loader.dataset.data[:num_test].numpy()
mean_image = getXmean(x_test)
x_test = centralized(x_test, mean_image)
y_test = test_loader.dataset.targets[:num_test].numpy()
KNN算法具体实现。距离度量分别实现了欧氏(欧几里得)距离和曼哈顿距离两种公式。利用距离公式得到测试数据到每个训练数据的距离,然后将这些距离按升序排序,选择前k个最邻近的训练样本,统计这k个样本对应的类别,其中出现次数最多的类别即为测试数据的预测类别。
def predict(self, k, dis, X_test):
    """Classify each row of X_test by k-nearest-neighbour majority vote.

    Args:
        k: number of nearest neighbours that vote (k >= 1).
        dis: 'E' for Euclidean distance, 'M' for Manhattan distance.
        X_test: array of shape (num_test, num_features). self.Xtr and
            self.ytr must already hold the (flattened) training data.

    Returns:
        np.ndarray of predicted labels, one per row of X_test.

    Raises:
        AssertionError: if dis is neither 'E' nor 'M'.
    """
    assert dis == 'E' or dis == 'M', 'dis must E or M,E代表欧拉距离,M代表曼哈顿距离'
    label_list = []
    for test_row in X_test:
        # Broadcasting replaces the original np.tile copy of the test row.
        diff = self.Xtr - test_row
        if dis == 'E':
            distances = np.sqrt(np.sum(diff ** 2, axis=1))
        else:
            # BUG FIX: Manhattan distance is sum(|xi - yi|).  The original
            # computed |sum(xi - yi)|, which lets positive and negative
            # components cancel and is not a valid metric.
            distances = np.sum(np.abs(diff), axis=1)
        top_k = np.argsort(distances)[:k]  # indices of the k nearest samples
        # Majority vote among the k nearest training labels; ties resolve to
        # the label seen first, matching the original stable sort.
        class_count = {}
        for idx in top_k:
            label = self.ytr[idx]
            class_count[label] = class_count.get(label, 0) + 1
        best_label, _ = max(class_count.items(), key=lambda item: item[1])
        label_list.append(best_label)
    return np.array(label_list)
四、运行结果
五、具体代码
Utils.py
import numpy as np
def getXmean(x_train):
    """Return the per-pixel mean image of x_train.

    Each sample (e.g. a 28x28 image) is flattened to one row vector, then
    the mean of every column (pixel position) over all samples is taken.
    """
    flat = np.reshape(x_train, (x_train.shape[0], -1))
    return np.mean(flat, axis=0)
def centralized(x_test, mean_image):
    """Flatten images to row vectors and subtract the per-pixel mean.

    Args:
        x_test: image array of shape (num_samples, ...); trailing
            dimensions are flattened into a single feature axis.
        mean_image: per-feature mean, e.g. the result of getXmean().

    Returns:
        float64 array of shape (num_samples, num_features), mean-centred.
    """
    x_test = np.reshape(x_test, (x_test.shape[0], -1))
    # BUG FIX: np.float was removed in NumPy 1.24 (deprecated since 1.20);
    # use the explicit float64 dtype instead.
    x_test = x_test.astype(np.float64)
    x_test -= mean_image  # centre each feature around zero
    return x_test
class Knn:
    """Brute-force k-nearest-neighbour classifier over flattened samples."""

    def __init__(self):
        pass

    def fit(self, X_train, y_train):
        """Memorise the training data; KNN has no real training phase."""
        self.Xtr = X_train
        self.ytr = y_train

    def predict(self, k, dis, X_test):
        """Classify each row of X_test by k-nearest-neighbour majority vote.

        Args:
            k: number of nearest neighbours that vote (k >= 1).
            dis: 'E' for Euclidean distance, 'M' for Manhattan distance.
            X_test: array of shape (num_test, num_features).

        Returns:
            np.ndarray of predicted labels, one per row of X_test.

        Raises:
            AssertionError: if dis is neither 'E' nor 'M'.
        """
        assert dis == 'E' or dis == 'M', 'dis must E or M,E代表欧拉距离,M代表曼哈顿距离'
        label_list = []
        for test_row in X_test:
            # Broadcasting replaces the original np.tile copy of the row.
            diff = self.Xtr - test_row
            if dis == 'E':
                distances = np.sqrt(np.sum(diff ** 2, axis=1))
            else:
                # BUG FIX: Manhattan distance is sum(|xi - yi|).  The
                # original computed |sum(xi - yi)|, which lets positive and
                # negative components cancel and is not a valid metric.
                distances = np.sum(np.abs(diff), axis=1)
            top_k = np.argsort(distances)[:k]
            # Majority vote; ties resolve to the label seen first, matching
            # the original stable sort behaviour.
            class_count = {}
            for idx in top_k:
                label = self.ytr[idx]
                class_count[label] = class_count.get(label, 0) + 1
            best_label, _ = max(class_count.items(), key=lambda item: item[1])
            label_list.append(best_label)
        return np.array(label_list)
Mnist_knn.py
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from Utils import *
# Data-loading configuration.
batch_size = 100
path = './'

# Download (if necessary) and open both MNIST splits; transform=None keeps
# the raw images, conversion to numpy happens below.
train_datasets = datasets.MNIST(
    root=path,       # root directory for the dataset files
    train=True,      # training split
    transform=None,
    download=True,
)
test_datasets = datasets.MNIST(
    root=path,
    train=False,     # test split
    transform=None,
    download=True,
)

train_loader = DataLoader(train_datasets, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_datasets, batch_size=batch_size, shuffle=True)

# Flatten and mean-centre the training images (helpers from Utils.py).
x_train = train_loader.dataset.data.numpy()
mean_image = getXmean(x_train)
x_train = centralized(x_train, mean_image)
y_train = train_loader.dataset.targets.numpy()

# Only the first num_test test samples are evaluated; they are centred
# with their own mean rather than the training mean.
num_test = 200
x_test = test_loader.dataset.data[:num_test].numpy()
mean_image = getXmean(x_test)
x_test = centralized(x_test, mean_image)
y_test = test_loader.dataset.targets[:num_test].numpy()

print("train_data:", x_train.shape)
print("train_label:", len(y_train))
print("test_data:", x_test.shape)
print("test_labels:", len(y_test))

# Report accuracy for k = 1, 3, 5 using Euclidean distance.
for k in range(1, 6, 2):
    classifier = Knn()
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(k, 'E', x_test)
    num_correct = np.sum(y_pred == y_test)
    accuracy = float(num_correct) / num_test
    print('Got %d / %d correct when k= %d => accuracy: %f' % (num_correct, num_test, k, accuracy))
参考1: https://developer.aliyun.com/article/726957.
参考2: https://blog.csdn.net/qq_44761480/article/details/103321983.