KNN算法的封装实现

1 KNN算法

基本思想:对每一个测试样本,在训练集中找出与其距离最近的k个样本,并将这k个样本中出现次数最多的类别作为该测试样本的预测类别。

1.1 对单个样本实现knn

#导入库
import numpy as np 
import matplotlib.pyplot as plt
from collections import Counter
import math
# Toy data set: ten 2-D points, the first five labelled 0 and the last five labelled 1.
k = 5  # number of neighbours that take part in the vote
x = np.array([5.2, 3.2])  # query point to classify
raw_data_y = [0] * 5 + [1] * 5
raw_data_X = [
    [3.393533211, 2.331273381],
    [3.110073483, 1.781539638],
    [1.343808831, 3.368360954],
    [3.582294042, 4.679179110],
    [2.280362439, 2.866990263],
    [7.423436942, 4.696522875],
    [5.745051997, 3.533989803],
    [9.172168622, 2.511101045],
    [7.792783481, 3.424088941],
    [7.939820817, 0.791637231],
]
# NumPy views of the raw lists; these are what the following cells index into.
X_train = np.array(raw_data_X)
Y_train = np.array(raw_data_y)

使用散点图绘制模拟数据。

# One scatter call per class (label 0 first, then label 1) so each class
# gets its own default colour.
for label in (0, 1):
    members = X_train[Y_train == label]
    plt.scatter(members[:, 0], members[:, 1])
# The query point is drawn last, in red, so it stands out from both classes.
plt.scatter(x[0], x[1], color='r')
plt.draw()

在这里插入图片描述
计算测试数据与训练集中各个样本之间的距离。

# Euclidean distance from the query point x to every row of X_train,
# kept as a plain list in training-set order.
distance = [math.sqrt(np.square(x - sample).sum()) for sample in X_train]
distance

在这里插入图片描述
获取前k个距离最近的样本类别标签。

# Indices of the training samples ordered from closest to farthest.
nearest = np.argsort(distance)
# Labels of the k closest samples, closest first.
topK_y = list(Y_train[nearest[:k]])
topK_y
输出:[1, 0, 0, 0, 1]

使用collections中的Counter类统计各个类别出现次数。

# Tally how many of the k nearest neighbours carry each label.
counter = Counter()
counter.update(topK_y)
counter
输出:Counter({1: 2, 0: 3})

获取出现次数最多的前n个类别,此处n取1,即得票最多的那个类别。

# most_common(1) yields a one-element list of (label, votes); keep the label.
predict_y, _votes = counter.most_common(1)[0]
print('KNN将x分类为:', predict_y)
输出:KNN将x分类为: 0

1.2 对多个样本实现knn

仿照sklearn接口对代码进行整理,实现对多个样本进行knn分类。

import numpy as np
import math
from collections import Counter

class KNNClasifier():
    """Brute-force k-nearest-neighbours classifier with a scikit-learn-style API.

    ``fit`` stores the training data; ``predict`` labels every query row with
    the most frequent label among its ``k`` closest training rows, measured by
    Euclidean distance.

    NOTE: argument checks use ``assert`` and therefore raise ``AssertionError``;
    they are skipped entirely when Python runs with ``-O``.
    """

    def __init__(self,k):
        """Create an unfitted classifier.

        Args:
            k: Number of neighbours that take part in the vote; must be >= 1.
        """
        assert k>=1,'k must be valid'
        self.k=k
        # Training data, populated by fit(); None means "not fitted yet".
        self._X_train=None
        self._Y_train=None

    def fit(self,X_train,Y_train):
        """Store the training samples and their labels.

        Args:
            X_train: Array-like of shape (n_samples, n_features) with numeric
                features. Converted to a float array so that integer inputs
                (unsigned ones in particular) cannot wrap around when
                differences are squared later on.
            Y_train: Array-like of n_samples labels, one per row of X_train.

        Returns:
            The classifier itself, so calls can be chained.

        Raises:
            AssertionError: If the number of samples and labels differ, or if
                there are fewer than ``k`` training samples.
        """
        X_train=np.asarray(X_train,dtype=float)
        Y_train=np.asarray(Y_train)
        assert X_train.shape[0]==Y_train.shape[0],\
            'the shape of X_train must equal to Y_train'
        assert self.k<=X_train.shape[0],\
            'the size of X_train must be at least k'
        self._X_train=X_train
        self._Y_train=Y_train
        return self

    def predict(self,X_predict):
        """Predict a label for every row of X_predict.

        Args:
            X_predict: Array-like of shape (n_queries, n_features).

        Returns:
            A list with one predicted label per query row, in input order.

        Raises:
            AssertionError: If called before ``fit`` or if the feature count
                differs from the training data.
        """
        assert self._X_train is not None and self._Y_train is not None,\
            'must fit before predict!'
        X_predict=np.asarray(X_predict,dtype=float)
        assert X_predict.shape[1]==self._X_train.shape[1],\
            'the feature number of X_predict must equal to X_train'
        return [self._predict(x_predict) for x_predict in X_predict]

    def _predict(self,x_predict):
        """Return the predicted label for a single sample.

        Args:
            x_predict: Array-like of shape (n_features,).

        Returns:
            The label that occurs most often among the ``k`` nearest training
            samples. When several labels share the top count, the one met
            first while walking the neighbours from nearest to farthest wins.
        """
        x_predict=np.asarray(x_predict,dtype=float)
        assert x_predict.shape[0]==self._X_train.shape[1],\
            'the feature number of x_predict must equal to X_train'
        # One vectorized pass over all training rows instead of a Python loop.
        distance=np.sqrt(np.sum((self._X_train-x_predict)**2,axis=1))
        # A stable sort keeps equidistant samples in training order, which
        # makes the result deterministic when distances tie.
        nearest=np.argsort(distance,kind='stable')
        voter=Counter(self._Y_train[nearest[:self.k]])
        return voter.most_common(1)[0][0]

    def __repr__(self):
        """Return a short description such as ``KNN(k=5)``."""
        return 'KNN(k=%d)' % self.k

if __name__=='__main__':
    # Toy data set: ten 2-D points, the first five labelled 0, the rest 1.
    raw_data_X = [[3.393533211, 2.331273381],
                  [3.110073483, 1.781539638],
                  [1.343808831, 3.368360954],
                  [3.582294042, 4.679179110],
                  [2.280362439, 2.866990263],
                  [7.423436942, 4.696522875],
                  [5.745051997, 3.533989803],
                  [9.172168622, 2.511101045],
                  [7.792783481, 3.424088941],
                  [7.939820817, 0.791637231]
                  ]
    raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    k = 5  # number of neighbours that vote
    X_train = np.array(raw_data_X)
    Y_train = np.array(raw_data_y)
    x = np.array([5.2, 3.2])  # query point to classify
    # Pass k instead of repeating the literal, so the setting lives in one place.
    knn=KNNClasifier(k)
    print(knn)
    knn.fit(X_train,Y_train)
    # predict() expects a 2-D array, so the single query becomes one row.
    print(knn.predict(x.reshape(1,-1)))

1.3 sklearn中的KNN

from sklearn.neighbors import KNeighborsClassifier
import numpy as np

if __name__=='__main__':
    # Toy data set: ten 2-D points, the first five labelled 0, the rest 1.
    raw_data_X = [[3.393533211, 2.331273381],
                  [3.110073483, 1.781539638],
                  [1.343808831, 3.368360954],
                  [3.582294042, 4.679179110],
                  [2.280362439, 2.866990263],
                  [7.423436942, 4.696522875],
                  [5.745051997, 3.533989803],
                  [9.172168622, 2.511101045],
                  [7.792783481, 3.424088941],
                  [7.939820817, 0.791637231]
                  ]
    raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    k = 5  # number of neighbours that vote
    X_train = np.array(raw_data_X)
    Y_train = np.array(raw_data_y)
    x = np.array([5.2, 3.2])  # query point to classify
    # Pass k instead of repeating the literal, so the setting lives in one place.
    knn=KNeighborsClassifier(n_neighbors=k)
    print(knn)
    knn.fit(X_train,Y_train)
    # predict() expects a 2-D array, so the single query becomes one row.
    print(knn.predict(x.reshape(1,-1)))
  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值