Notes on Deep Learning and Image Recognition: Principles and Practice

Chapter 3: KNN in Practice

#### KNN Algorithm Implementation
import numpy as np
import matplotlib.pyplot as plt
import operator


# Create the training data and the corresponding class labels
def createDataSet():
    group = np.array([[1.0, 2.0], [1.2, 0.1], [0.1, 1.4], [0.3, 3.5], [1.1, 1.0], [0.5, 1.5]])
    labels = np.array(['A', 'A', 'B', 'B', 'A', 'B'])
    return group, labels

## Here the value of k is fixed
def kNN_classify(k, dis, X_train, y_train, X_test):
    assert dis == 'E' or dis == 'M', "dis must be 'E' or 'M': E for Euclidean distance, M for Manhattan distance"
    num_test = X_test.shape[0]  # number of test samples
    labellist = []
    '''
    Use Euclidean distance as the metric
    '''
    if (dis == 'E'):
        for i in range(num_test):
            # Euclidean distance between the i-th test sample and every training sample
            distances = np.sqrt(np.sum(((X_train - np.tile(X_test[i], (X_train.shape[0], 1))) ** 2), axis=1))
            nearest_k = np.argsort(distances)
            # sort the distances in ascending order and return the indices
            topK = nearest_k[:k]
            # take the k nearest neighbors
            classCount = {}
            for j in topK:
                # count how many neighbors fall into each class
                classCount[y_train[j]] = classCount.get(y_train[j], 0) + 1
            sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
            labellist.append(sortedClassCount[0][0])
        return np.array(labellist)
    '''
    Use Manhattan distance as the metric
    '''
    if (dis == 'M'):
        for i in range(num_test):
            # Manhattan distance between the i-th test sample and every training sample
            distances = np.sum(np.abs(X_train - np.tile(X_test[i], (X_train.shape[0], 1))), axis=1)
            nearest_k = np.argsort(distances)
            # sort the distances in ascending order and return the indices
            topK = nearest_k[:k]
            # take the k nearest neighbors
            classCount = {}
            for j in topK:
                # count how many neighbors fall into each class
                classCount[y_train[j]] = classCount.get(y_train[j], 0) + 1
            sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
            labellist.append(sortedClassCount[0][0])
        return np.array(labellist)


if __name__ == '__main__':
    group, labels = createDataSet()
    y_test_pred = kNN_classify(2, 'E', group, labels, np.array([[1.0, 2.1], [0.4, 2.0]]))
    print(y_test_pred)  # prints ['A' 'B']
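
As a quick cross-check (an addition to these notes, assuming scikit-learn is installed), the same toy problem can be run through scikit-learn's KNeighborsClassifier. With k=3 there are no vote ties, so both implementations should return the same labels:

# Illustrative sanity check, not part of the original notes.
from sklearn.neighbors import KNeighborsClassifier

group, labels = createDataSet()
X_new = np.array([[1.0, 2.1], [0.4, 2.0]])
clf = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
clf.fit(group, labels)
print(clf.predict(X_new))                          # expected: ['A' 'B']
print(kNN_classify(3, 'E', group, labels, X_new))  # expected: ['A' 'B']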


#### KNN for MNIST Classification
import torch
from torch.utils.data import DataLoader
import torchvision.datasets as dsets
import torchvision.transforms as transforms
batch_size = 100

# MNIST dataset
train_dataset = dsets.MNIST(root = '/ml/pymnist', train = True, transform = None, download = True) 
test_dataset = dsets.MNIST(root = '/ml/pymnist', train = False, transform = None, download = True) 

# Load the data
train_loader = torch.utils.data.DataLoader(dataset = train_dataset,batch_size= batch_size,shuffle = True)
test_loader = torch.utils.data.DataLoader(dataset = test_dataset,batch_size = batch_size,shuffle = True)


print("train_data:", train_dataset.train_data.size())
print("train_labels:", train_dataset.train_labels.size())
print("test_data:", test_dataset.test_data.size())
print("test_labels:", test_dataset.test_labels.size())


if __name__ == '__main__':
    X_train = train_loader.dataset.train_data.numpy()  # convert to a numpy array
    X_train = X_train.reshape(X_train.shape[0], 28*28)  # reshape to (N, 784) before feeding it to the KNN classifier
    y_train = train_loader.dataset.train_labels.numpy()
    X_test = test_loader.dataset.test_data[:1000].numpy()
    X_test = X_test.reshape(X_test.shape[0],28*28)
    y_test = test_loader.dataset.test_labels[:1000].numpy()
    num_test = y_test.shape[0]
    y_test_pred = kNN_classify(5, 'M', X_train, y_train, X_test)
    num_correct = np.sum(y_test_pred == y_test)
    accuracy = float(num_correct) / num_test
    print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
    # accuracy is low, only about 36.8%; consider standardizing the data

def getXmean(X_train):
    X_train = np.reshape(X_train, (X_train.shape[0], -1))
    # flatten each image from 2-D to 1-D
    mean_image = np.mean(X_train, axis=0)
    # per-pixel mean over all training images
    return mean_image

def centralized(X_test,mean_image):
    X_test = np.reshape(X_test, (X_test.shape[0], -1))  # flatten each image from 2-D to 1-D
    X_test = X_test.astype(float)
    X_test -= mean_image  # subtract the mean image to zero-center the data
    return X_test
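
For intuition (a tiny made-up example added to these notes, not from the original), getXmean computes one mean value per pixel position and centralized subtracts that mean image from every flattened sample:

# Hypothetical 2x2 "images", just to illustrate the two helpers above.
toy = np.array([[[0, 2], [4, 6]],
                [[2, 4], [6, 8]],
                [[4, 6], [8, 10]]])
m = getXmean(toy)           # per-pixel mean: [2. 4. 6. 8.]
print(m)
print(centralized(toy, m))  # each flattened image minus the mean image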


if __name__ == '__main__':
    X_train = train_loader.dataset.train_data.numpy()
    mean_image = getXmean(X_train)
    X_train = centralized(X_train,mean_image)
    y_train = train_loader.dataset.train_labels.numpy()
    X_test = test_loader.dataset.test_data[:1000].numpy()
    X_test = centralized(X_test,mean_image)
    y_test = test_loader.dataset.test_labels[:1000].numpy()
    num_test = y_test.shape[0]
    y_test_pred = kNN_classify(5, 'M', X_train, y_train, X_test)
    num_correct = np.sum(y_test_pred == y_test)
    accuracy = float(num_correct) / num_test
    print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
    ## after zero-centering, accuracy rises to about 95%


## Cross-validation to choose the hyperparameter k
class Knn:
    def __init__(self):
        pass
    def fit(self, X_train, y_train):
        self.Xtr = X_train
        self.ytr = y_train
    def predict(self, k, dis, X_test):
        assert dis == 'E' or dis == 'M', 'dis must be E or M'
        num_test = X_test.shape[0]
        labellist = []
        # Euclidean distance as the metric
        if (dis == 'E'):
            for i in range(num_test):
                distances = np.sqrt(np.sum(((self.Xtr - np.tile(X_test[i], (self.Xtr.shape[0], 1))) ** 2), axis=1))
                nearest_k = np.argsort(distances)
                topK = nearest_k[:k]
                classCount = {}
                for j in topK:
                    classCount[self.ytr[j]] = classCount.get(self.ytr[j], 0) + 1
                sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
                labellist.append(sortedClassCount[0][0])
            return np.array(labellist)
        # Manhattan distance as the metric
        if (dis == 'M'):
            for i in range(num_test):
                # sum along axis=1, i.e., add up the absolute differences within each row
                distances = np.sum(np.abs(self.Xtr - np.tile(X_test[i], (self.Xtr.shape[0], 1))), axis=1)
                nearest_k = np.argsort(distances)
                topK = nearest_k[:k]
                classCount = {}
                for j in topK:
                    classCount[self.ytr[j]] = classCount.get(self.ytr[j], 0) + 1
                sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
                labellist.append(sortedClassCount[0][0])
            return np.array(labellist)
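
As a quick usage sketch (added for illustration, reusing the toy data from createDataSet defined earlier), the refactored class gives the same kind of predictions through fit/predict:

# Illustrative only: run the Knn class on the small dataset defined earlier.
toy_group, toy_labels = createDataSet()
toy_knn = Knn()
toy_knn.fit(toy_group, toy_labels)
print(toy_knn.predict(3, 'E', np.array([[1.0, 2.1], [0.4, 2.0]])))  # expected: ['A' 'B']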



X_train = train_loader.dataset.train_data.numpy()
X_train = X_train.reshape(X_train.shape[0], -1)
mean_image = getXmean(X_train)
X_train = centralized(X_train, mean_image)
y_train = train_loader.dataset.train_labels
y_train = np.array(y_train)
X_test = test_loader.dataset.test_data.numpy()
X_test = X_test.reshape(X_test.shape[0], -1)
X_test = centralized(X_test, mean_image)
y_test = test_loader.dataset.test_labels
y_test = np.array(y_test)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

# Split the training data into 5 folds; each fold takes a turn as the validation set
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20]  # k is usually chosen within 1~20
num_training = X_train.shape[0]
X_train_folds = []
y_train_folds = []
indices = np.array_split(np.arange(num_training), indices_or_sections=num_folds)
# split the indices into 5 parts
for i in indices:
    X_train_folds.append(X_train[i])
    y_train_folds.append(y_train[i])
k_to_accuracies = {}
for k in k_choices:
    # run cross-validation for this k
    acc = []
    for i in range(num_folds):
        x = X_train_folds[0:i] + X_train_folds[i+1:]
        # the training folds exclude the validation fold
        x = np.concatenate(x, axis=0)
        # use concatenate to stitch the 4 training folds together
        y = y_train_folds[0:i] + y_train_folds[i+1:]
        y = np.concatenate(y)
        # do the same for the labels
        test_x = X_train_folds[i]
        # hold out the validation fold
        test_y = y_train_folds[i]
        classifier = Knn()
        # define the model
        classifier.fit(x, y)
        # load the training folds
        y_pred = classifier.predict(k, 'M', test_x)
        # predict on the validation fold
        accuracy = np.mean(y_pred == test_y)
        # compute the accuracy
        acc.append(accuracy)
    k_to_accuracies[k] = acc
    # store the per-fold accuracies for this k
# print the accuracies
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))


# Visualize accuracy as a function of k
# plot the raw observations
import matplotlib.pyplot as plt
for k in k_choices:
    accuracies = k_to_accuracies[k]
    plt.scatter([k] * len(accuracies), accuracies)
# plot the trend line with error bars that correspond to standard deviation
accuracies_mean = np.array([np.mean(v) for k,v in sorted(k_to_accuracies.items())])
accuracies_std = np.array([np.std(v) for k,v in sorted(k_to_accuracies.items())])
plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
plt.title('Cross-validation on k')
plt.xlabel('k')
plt.ylabel('Cross-validation accuracy')
plt.show()        
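
A natural final step (not in the original notes; sketched here assuming the variables above are still in scope) is to pick the k with the highest mean cross-validation accuracy and score it once on the held-out test set. With the full 10,000-image MNIST test set this is slow, so slicing X_test[:1000] as before keeps it quick:

# Illustrative sketch: choose the best k from the cross-validation results,
# then evaluate it on the test data prepared earlier.
best_k = max(k_choices, key=lambda k: np.mean(k_to_accuracies[k]))
print('best k =', best_k)

final_knn = Knn()
final_knn.fit(X_train, y_train)
y_test_pred = final_knn.predict(best_k, 'M', X_test[:1000])
print('test accuracy = %f' % np.mean(y_test_pred == y_test[:1000]))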