k近邻算法-3.算法应用

算法具体应用

import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from sklearn import datasets

加载手写数据集

digits = datasets.load_digits()  # Load the handwritten digits dataset

1465169-20190713094406085-1686757288.png

完整的UCI手写数字数据集共有5620个样本(sklearn的load_digits()加载的是其中的1797个样本),每个样本有64个特征,即8×8手写数字图像的像素值,样本的标签为0-9的数字,其数据集描述如下:
1465169-20190713094418685-387920369.png

样本结构:
1465169-20190713094458604-2118026381.png

数据可视化,查看某个样本的特征和结果:

# Feature matrix and label vector of the digits dataset
x, y = digits.data, digits.target

# Pick the sample stored at index 222 (zero-based)
some_digit = x[222]

# Reshape the flat feature vector into an 8x8 pixel grid so it can be drawn
some_digit_image = some_digit.reshape(8, 8)
plt.imshow(some_digit_image, cmap=matplotlib.cm.binary)
plt.show()

1465169-20190713094551697-2055952211.png

查看此数据的结果:
1465169-20190713094603684-1524186202.png

封装之前的代码,实现手写数据集的预测

定义K近邻算法(KNN.py):

import numpy as np
from math import sqrt
from collections import Counter


class KNNClassifier:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote.

    ``fit`` only stores the training data; every distance is computed at
    prediction time.
    """

    def __init__(self, k):
        """Create a classifier that votes among the ``k`` nearest samples.

        Raises:
            AssertionError: if ``k`` is smaller than 1.
        """
        assert k >= 1, 'k must be valid'
        self.k = k
        self._x_train = None
        self._y_train = None

    def fit(self, _x_train, _y_train):
        """Store the training samples and their labels.

        Returns:
            The classifier itself, so calls can be chained.
        """
        self._x_train = _x_train
        self._y_train = _y_train
        return self

    def predict(self, x_predict):
        """Predict a label for every row of ``x_predict``.

        Args:
            x_predict: 2-D array whose column count matches the training data.

        Returns:
            A NumPy array holding one predicted label per row.

        Raises:
            AssertionError: if ``fit`` has not been called, if the stored
                samples and labels differ in length, or if the feature
                counts do not match.
        """
        assert self._x_train is not None and self._y_train is not None, \
            "must fit before predict!"
        assert self._x_train.shape[0] == self._y_train.shape[0], \
            "the size of x_train must equal to the size of y_train"
        assert self._x_train.shape[1] == x_predict.shape[1], \
            "the feature number of x must be equal to x_train"

        return np.array([self._predict(x) for x in x_predict])

    def _predict(self, x):
        """Predict the label of a single sample ``x`` (a 1-D feature vector)."""
        assert self._x_train.shape[1] == x.shape[0], \
            "the feature number of x must be equal to x_train"

        # Euclidean distance from x to every training row in one vectorised
        # pass instead of a Python-level loop over the rows.
        distances = np.sqrt(np.sum((self._x_train - x) ** 2, axis=1))

        # Training-row indices ordered from nearest to farthest.
        nearest = np.argsort(distances)

        # Labels of the k closest training samples.
        topK_y = [self._y_train[i] for i in nearest[:self.k]]

        # Majority vote; only the single most frequent label is needed.
        votes = Counter(topK_y)
        return votes.most_common(1)[0][0]

    def __repr__(self):
        return 'KNN(k=%d)' % self.k
        

定义模型选择库(model_selection.py)

import numpy as np

#训练 测试数据集分离
def train_test_split(x, y, test_ratio=0.2, seed=None):
    """Randomly split ``x`` and ``y`` into training and test subsets.

    Args:
        x: array of samples, indexed along its first axis.
        y: array of labels with the same length as ``x``.
        test_ratio: fraction of the samples placed in the test subset.
        seed: optional seed passed to ``np.random.seed`` before shuffling;
            ``None`` leaves the global random state untouched.

    Returns:
        The tuple ``(x_train, x_test, y_train, y_test)``.

    Raises:
        AssertionError: if ``x`` and ``y`` differ in length or ``test_ratio``
            lies outside ``[0, 1]``.
    """
    assert x.shape[0] == y.shape[0],\
        "the size of x must be equal to the size of y"
    assert 0.0 <= test_ratio <= 1.0,\
        "test_ratio must be valid"

    # Compare against None explicitly: 0 is a legitimate seed, but a plain
    # truthiness test would silently ignore it.
    if seed is not None:
        np.random.seed(seed)

    shuffle_index = np.random.permutation(len(x))

    # The first test_size shuffled indices form the test subset.
    test_size = int(len(x) * test_ratio)
    test_index = shuffle_index[:test_size]
    train_index = shuffle_index[test_size:]

    return x[train_index], x[test_index], y[train_index], y[test_index]

使用自己封装的库:

from mylib.model_selection import train_test_split
from mylib.KNN import KNNClassifier
    
# Hold out 20% of the samples for testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_ratio=0.2)

# Build a 3-nearest-neighbour classifier; fit() hands back the classifier itself
my_clf = KNNClassifier(k=3).fit(x_train, y_train)

# Predicted labels for the held-out samples
y_predict = my_clf.predict(x_test)

验证算法的准确率:

# Accuracy: share of test samples whose predicted label equals the true label.
# This script imports NumPy as `np`; the bare name `numpy` is not defined here.
score = np.sum(y_predict == y_test) / len(y_test)

1465169-20190713094640329-622572258.png

封装,实现解耦:

# metrics.py   (metrics 意为衡量标准)
import numpy
import math

def accuracy_score(y_true, y_predict):
    """Return the fraction of predictions that equal the true labels.

    Raises:
        AssertionError: if the two arrays differ in length along axis 0.
    """
    expected_count = y_true.shape[0]
    assert expected_count == y_predict.shape[0], \
        "size of y_true must be equal to the size of y_predict"

    # Element-wise comparison; summing the boolean mask counts the matches.
    matches = (y_true == y_predict)
    hit_count = numpy.sum(matches)
    return hit_count / len(y_true)
    
# KNN.py     添加求准确率方法
from .metrics import accuracy_score

def score(self,x_test,y_test):
    """Return the accuracy of this classifier's predictions for ``x_test`` against ``y_test``."""
    return accuracy_score(y_test, self.predict(x_test))

转载于:https://www.cnblogs.com/shuai-long/p/11179490.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值