统计学习方法_kNN实现

数据集与上一篇文章不同,可以使用完整的MNIST数据集了,下载地址:MNIST

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import time
import cv2
from sklearn.cross_validation import train_test_split

# 提取hog特征,784 -> 324
def get_hog_features(trainset):
    """Compute a HOG descriptor for every flattened image in ``trainset``.

    Each row is reshaped to 28x28, cast to ``uint8`` and passed to a
    ``cv2.HOGDescriptor`` that is configured from ``./hog.xml``.

    Args:
        trainset: iterable of flattened images, each holding 28*28 values.

    Returns:
        ``np.ndarray`` of shape ``(number_of_images, 324)``.
    """
    # The descriptor settings are read from the XML file next to the script.
    descriptor = cv2.HOGDescriptor('./hog.xml')

    # uint8 covers 0-255, the same range as the pixel values.
    hog_vectors = [
        descriptor.compute(row.reshape(28, 28).astype(np.uint8))
        for row in trainset
    ]

    # Flatten every descriptor to one row of 324 values (18 * 18).
    return np.array(hog_vectors).reshape(-1, 324)

def Predict(testset, trainset, train_labels, n_neighbors=None):
    """Classify each test vector by majority vote among its nearest neighbours.

    Distances are Euclidean. For every test vector the ``n_neighbors``
    closest training vectors vote with their labels; the label with the most
    votes wins, and ties are resolved in favour of the smallest label.

    The index of each processed test vector is printed as a progress
    indicator, with a line break after every 100 vectors.

    Args:
        testset: sequence of test vectors, each of length D.
        trainset: array-like of shape (N, D) holding the training vectors.
        train_labels: length-N sequence of non-negative integer class labels.
        n_neighbors: number of neighbours that vote. ``None`` (the default)
            uses the module-level ``k``. Values larger than N are clamped
            to N.

    Returns:
        1-D ``np.ndarray`` with one predicted label per test vector.

    Raises:
        ValueError: if the training set is empty, if ``trainset`` and
            ``train_labels`` differ in length, or if ``n_neighbors < 1``.
    """
    if n_neighbors is None:
        n_neighbors = k

    # Work in floating point so unsigned-integer inputs cannot wrap around
    # when two vectors are subtracted.
    trainset = np.asarray(trainset, dtype=float)
    train_labels = np.asarray(train_labels)

    if len(trainset) != len(train_labels):
        raise ValueError('trainset and train_labels must have the same length')
    if len(train_labels) == 0:
        raise ValueError('the training set must not be empty')
    if n_neighbors < 1:
        raise ValueError('n_neighbors must be at least 1')

    # There cannot be more voters than training samples.
    n_neighbors = min(n_neighbors, len(train_labels))

    predict = []
    for index, test_vec in enumerate(testset):
        print(index, end=" ")  # progress: index of the current test vector
        if (index + 1) % 100 == 0:
            print()

        # Distance from this test vector to every training vector at once.
        dists = np.linalg.norm(
            trainset - np.asarray(test_vec, dtype=float), axis=1
        )

        # Indices of the n_neighbors smallest distances (order irrelevant).
        nearest = np.argpartition(dists, n_neighbors - 1)[:n_neighbors]

        # Count votes per label; argmax returns the smallest label on a tie.
        votes = np.bincount(train_labels[nearest])
        predict.append(votes.argmax())

    return np.array(predict, dtype=train_labels.dtype)


k = 10  # default neighbour count; the best k can be chosen by cross-validation

if __name__ == '__main__':
    # Script entry point: load the CSV, extract HOG features, split the
    # data, run the kNN prediction and report timings and accuracy.
    print('Start reading data:')

    read_start = time.time()

    # Column 0 of the CSV is taken as the label, the remaining columns as
    # the image data.
    table = pd.read_csv('./data/train.csv').values
    labels = table[:, 0]
    img = table[:, 1:]

    print(img.shape)
    print(labels.shape)

    features = get_hog_features(img)
    print(features.shape)

    # Hold out 33% of the samples for testing; the seed fixes the split.
    split = train_test_split(
        features, labels, test_size=0.33, random_state=11111
    )
    train_features, test_features, train_labels, test_labels = split

    read_end = time.time()
    print('read data cost %f seconds' % (read_end - read_start))

    # There is no training step; the timing is printed for completeness.
    print('Starting training:')
    print('knn do not need to train!')
    train_end = time.time()
    print('training cost %f seconds' % (train_end - read_end))

    print('Starting predicting:')
    test_predict = Predict(test_features, train_features, train_labels)
    predict_end = time.time()
    print('predicting cost %f seconds' % (predict_end - train_end))

    # Fraction of test samples whose predicted label equals the true label.
    flat_predict = test_predict.reshape(len(test_labels))
    hits = np.sum(test_labels == flat_predict)
    accuracy = hits / len(test_labels)
    print('The accuracy is %f!' % accuracy)

'''
Start reading data:
(42000, 784)
(42000,)
(42000, 324)
read data cost 6.009209 seconds
Starting training:
knn do not need to train!
training cost 0.000033 seconds
Starting predicting:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
to be continued
运行时间过长不算出accuracy了
'''

 

手写数字识别是机器学习中经典的问题之一,KNN(K-最近邻)算法是一种常用的分类算法。下面给出基于Python的手写数字识别KNN算法的实现过程。

1. 数据集准备

首先,我们需要一个手写数字的数据集。MNIST数据集是一个经典的手写数字数据集,可以从 http://yann.lecun.com/exdb/mnist/ 下载。下载后,将数据集解压缩到本地文件夹中。

2. 数据预处理

将数据集中的图片转换成向量形式,以便于计算机处理。这里我们将每张图片转换成一个784维的向量(28*28像素),并将像素值归一化到[0,1]范围内。

```python
import os
import struct
import numpy as np

def load_mnist(path, kind='train'):
    labels_path = os.path.join(path, '%s-labels-idx1-ubyte' % kind)
    images_path = os.path.join(path, '%s-images-idx3-ubyte' % kind)
    with open(labels_path, 'rb') as lbpath:
        magic, n = struct.unpack('>II', lbpath.read(8))
        labels = np.fromfile(lbpath, dtype=np.uint8)
    with open(images_path, 'rb') as imgpath:
        magic, num, rows, cols = struct.unpack('>IIII', imgpath.read(16))
        images = np.fromfile(imgpath, dtype=np.uint8).reshape(len(labels), 784)
    return images, labels

X_train, y_train = load_mnist('mnist/', kind='train')
X_test, y_test = load_mnist('mnist/', kind='t10k')

# 数据归一化
X_train = X_train / 255.0
X_test = X_test / 255.0
```

3. KNN算法实现

KNN算法的基本思路是:对于一个未知样本,计算它与训练集中每个样本的距离,选择距离最近的K个样本,将这K个样本中出现次数最多的标签作为预测结果。

```python
from collections import Counter

def knn(X_train, y_train, X_test, k):
    pred_labels = []
    for i in range(len(X_test)):
        # 计算测试样本与训练集样本的距离
        distances = np.sqrt(np.sum((X_train - X_test[i])**2, axis=1))
        # 选择距离最近的K个样本
        nearest = np.argsort(distances)[:k]
        # 统计K个样本的标签
        counter = Counter(y_train[nearest])
        # 将出现次数最多的标签作为预测结果
        pred_labels.append(counter.most_common(1)[0][0])
    return pred_labels
```

4. 测试效果

将KNN算法应用到手写数字识别问题上,测试其效果。

```python
pred_labels = knn(X_train, y_train, X_test[:100], 5)
accuracy = np.sum(pred_labels == y_test[:100]) / len(y_test[:100])
print('Accuracy:', accuracy)
```

输出结果如下:

```
Accuracy: 0.97
```

可以看出,KNN算法在手写数字识别问题上的表现还是比较不错的。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值